[WIP] FEA New meta-estimator to post-tune the decision_function/predict_proba threshold for binary classifiers #16525


Closed. Wants to merge 49 commits.
Commits (changes from all 49 commits):
d3705d6
FEA add CutoffCalibration estimator
glemaitre Feb 23, 2020
b99218b
add example
glemaitre Feb 23, 2020
ce66427
PEP8
glemaitre Feb 23, 2020
3ee3b5a
add whats new entry
glemaitre Feb 23, 2020
c5a51eb
mark as only working with binary data
glemaitre Feb 23, 2020
0aab70c
common tests fixes
glemaitre Feb 23, 2020
6e12f8a
iter
glemaitre Feb 24, 2020
420df8d
xxx
glemaitre Feb 24, 2020
6b36f68
iter
glemaitre Feb 24, 2020
00da5f7
pep8
glemaitre Feb 24, 2020
e8837e0
iter
glemaitre Feb 25, 2020
1a03fad
move to model_selection
glemaitre Feb 25, 2020
63b285a
add the missing files
glemaitre Feb 25, 2020
65e8329
Remove current documentation
glemaitre Feb 25, 2020
d924684
DOC add docstring examples
glemaitre Feb 25, 2020
237c919
PEP8
glemaitre Feb 25, 2020
4210537
fix for predict proba
glemaitre Feb 25, 2020
255dfe8
add support for cost-sensitive
glemaitre Feb 25, 2020
f3a372d
revert calibration changes
glemaitre Feb 25, 2020
f998625
start doc
glemaitre Feb 26, 2020
ca4c50a
iter
glemaitre Mar 11, 2020
3eb0289
Merge remote-tracking branch 'origin/master' into is/10117
glemaitre Mar 19, 2020
c0acab0
skip test
glemaitre Mar 19, 2020
3edc421
iter
glemaitre Mar 20, 2020
982918a
remove unsued code
glemaitre Mar 20, 2020
a426131
docstring
glemaitre Mar 20, 2020
470a09c
pep9
glemaitre Mar 20, 2020
a565589
TST wip
glemaitre Aug 7, 2020
6c0db49
TST wip
glemaitre Aug 7, 2020
c41e999
TST wip
glemaitre Aug 7, 2020
f53f833
TST PEP8 + comments
glemaitre Aug 7, 2020
dd4e9fe
TST force to use predict_proba as well
glemaitre Aug 7, 2020
e32cfa7
DOC add whats + PEP8
glemaitre Aug 7, 2020
07915e9
TST add some tolerance since the average of squared in diff ordered
glemaitre Aug 7, 2020
fc1c422
STY add better error message and refactor code
glemaitre Aug 10, 2020
aa5cd16
fix
glemaitre Aug 10, 2020
a669ecf
fix
glemaitre Aug 10, 2020
7fadbfb
Merge remote-tracking branch 'origin/master' into is/10117
glemaitre Aug 10, 2020
a477e7b
Update sklearn/metrics/tests/test_score_objects.py
glemaitre Aug 11, 2020
09b47bb
add test for PredictScorer
glemaitre Aug 11, 2020
42e7f00
apply olivier suggestions
glemaitre Aug 18, 2020
e9d7873
use list
glemaitre Aug 18, 2020
add8320
Merge remote-tracking branch 'origin/master' into is/10117
glemaitre Aug 19, 2020
7025768
fix
glemaitre Aug 19, 2020
0520f96
Merge remote-tracking branch 'glemaitre/is/scorer_pos_label' into is/…
glemaitre Aug 19, 2020
536753f
fix
glemaitre Aug 19, 2020
6a12a1f
PEP8
glemaitre Aug 19, 2020
b8cfd34
Merge branch 'is/scorer_pos_label' into is/10117
glemaitre Aug 19, 2020
0713232
finally passing?
glemaitre Aug 19, 2020
1 change: 1 addition & 0 deletions doc/model_selection.rst
@@ -14,6 +14,7 @@ Model selection and evaluation

modules/cross_validation
modules/grid_search
modules/prediction
modules/model_evaluation
modules/model_persistence
modules/learning_curve
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -1153,6 +1153,7 @@ Splitter Classes
:toctree: generated/
:template: class.rst

model_selection.CutoffClassifier
model_selection.GroupKFold
model_selection.GroupShuffleSplit
model_selection.KFold
34 changes: 34 additions & 0 deletions doc/modules/prediction.rst
@@ -0,0 +1,34 @@
.. currentmodule:: sklearn.model_selection

.. _prediction_tuning:
Reviewer comment (Member):

Nit, is this more about threshold tuning than prediction tuning?


================================================
Tuning of the decision threshold of an estimator
Suggested change (Member):
- Tuning of the decision threshold of an estimator
+ Tuning of the decision threshold of a classifier

================================================

The real-valued decision functions, i.e. `decision_function` and
`predict_proba`, of machine-learning classifiers carry the inherited biases of
Comment on lines +9 to +10
Suggested change (Member):
- The real-valued decision functions, i.e. `decision_function` and
- `predict_proba`, of machine-learning classifiers carry the inherited biases of
+ The real-valued decision functions, i.e. :term:`decision_function` and
+ :term:`predict_proba`, of machine-learning classifiers carry the inherited biases of

the fitted model; e.g, in a class imbalanced setting, a classifier
Suggested change (Member):
- the fitted model; e.g, in a class imbalanced setting, a classifier
+ the fitted model; e.g, in a class-imbalance setting, a classifier

will naturally lean toward the most frequent class. In some other cases, the
generic objective function used to train a model is generally unaware of the
evaluation criteria used to evaluate the model; e.g., one might want to
penalized differently a false-positive and false-negative ---it will be less
Suggested change (Member):
- penalized differently a false-positive and false-negative ---it will be less
+ penalize differently a false-positive and false-negative: it will be less

detrimental to show an MR image without a cancer (i.e., false-positive) to a
radiologist than hidding one with a cancer (i.e, false-negtative) when
Suggested change (Member):
- radiologist than hidding one with a cancer (i.e, false-negtative) when
+ radiologist than to hide one with a cancer (i.e, false-negtative) when

developing some computer-aided diagnosis system.

In a binary classification scenario, the hard-prediction, i.e. `predict`, for a
classifier most commonly use the `predict_proba` and apply a decision threshold
at 0.5 to output a positive or negative label. Thus, this hard-prediction
suffers from the same drawbacks than the one raised in the above paragraph.
Comment on lines +20 to +23
Reviewer comment (Member):
May I suggest:

In scikit-learn, classifiers apply a hard threshold to the output of `predict_proba` or `decision_function` in order to predict a class with :term:`predict`. This threshold is typically at 0.5 for probabilities as e.g. in :class:`~sklearn.linear_model.LogisticRegression`, and at 0 for decision functions as in the SVM classifiers. These hard-coded thresholds typically suffer from the class-imbalance issue and the false positive / false negative biases mentioned above.
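
To make the quoted claim concrete, a small editorial check (not part of this PR's diff) that binary `predict` is a hard cut at 0.5 on `predict_proba` for :class:`~sklearn.linear_model.LogisticRegression` and at 0 on `decision_function` for a linear SVM:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

X, y = make_classification(random_state=0)

lr = LogisticRegression().fit(X, y)
# Binary predict is a hard cut at probability 0.5 ...
assert np.array_equal(lr.predict(X),
                      (lr.predict_proba(X)[:, 1] > 0.5).astype(int))

svc = LinearSVC(random_state=0).fit(X, y)
# ... and at 0 on the decision function for SVM-style classifiers.
assert np.array_equal(svc.predict(X),
                      (svc.decision_function(X) > 0).astype(int))
```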


Post-tuning of the decision threshold
=====================================

:class:`CutoffClassifier` allows for post-tuning the decision threshold using
either `decision_function` or `predict_proba` and an objective metric for which
we want our threshold to be optimized.

Fine-tune using a single objective metric
-----------------------------------------
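
This section is still a stub in this WIP pull request. As a rough illustration of the intended workflow, here is a minimal hand-rolled sketch of post-tuning a threshold for a single metric; it deliberately avoids the proposed `CutoffClassifier` API (whose exact signature is not settled in this diff) and uses only established scikit-learn functions:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

# Imbalanced problem where the default 0.5 cutoff is usually suboptimal.
X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1],
                           random_state=0)
X_train, X_tune, y_train, y_tune = train_test_split(X, y, random_state=0)

clf = LogisticRegression().fit(X_train, y_train)
proba = clf.predict_proba(X_tune)[:, 1]

# Scan candidate thresholds on held-out data and keep the best one.
thresholds = np.linspace(0.01, 0.99, 99)
scores = [balanced_accuracy_score(y_tune, (proba >= t).astype(int))
          for t in thresholds]
best_threshold = thresholds[np.argmax(scores)]
print(f"tuned threshold: {best_threshold:.2f} (default: 0.5)")
```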

6 changes: 6 additions & 0 deletions doc/whats_new/v0.23.rst
@@ -611,6 +611,12 @@ Changelog
be removed in 0.25. :pr:`16401` by
:user:`Arie Pratama Sutiono <ariepratama>`

- |MajorFeature| :class:`model_selection.CutoffClassifier` calibrates the
decision threshold function of a classifier by maximizing a binary
Suggested change (Member):
- decision threshold function of a classifier by maximizing a binary
+ decision threshold function of a binary classifier by maximizing a

classification metric through cross-validation.
Suggested change (Member):
- classification metric through cross-validation.
+ classification metric through cross-validation.
+ That decision threshold is then used to convert a probability or the output of the decision function into a predicted class.

:pr:`16525` by :user:`Guillaume Lemaitre <glemaitre>` and
:user:`Prokopis Gryllos <PGryllos>`.

:mod:`sklearn.multioutput`
..........................

7 changes: 7 additions & 0 deletions doc/whats_new/v0.24.rst
@@ -314,6 +314,13 @@ Changelog
class to be used when computing the roc auc statistics.
:pr:`17651` by :user:`Clara Matos <claramatos>`.

- |Fix| Fix scorers that accept a pos_label parameter and compute their metrics
from values returned by `decision_function` or `predict_proba`. Previously,
they would return erroneous values when pos_label was not corresponding to
`classifier.classes_[1]`. This is especially important when training
classifiers directly with string labeled target classes.
:pr:`#18114` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.model_selection`
..............................

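A minimal sketch of the scenario the |Fix| entry above addresses, using the scorer API of that era (`needs_threshold=True`); the dataset and string labels are illustrative assumptions, not taken from the PR:

```python
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, make_scorer

X, y = load_breast_cancer(return_X_y=True)
# String targets so that the class of interest, "cancer", sorts first and
# becomes classes_[0] rather than the implicit positive class classes_[1].
y = np.array(["cancer", "not cancer"], dtype=object)[y]

clf = LogisticRegression(max_iter=10_000).fit(X, y)
scorer = make_scorer(average_precision_score, needs_threshold=True,
                     pos_label="cancer")
# With this fix the scorer flips decision_function so that larger values
# mean "cancer"; previously it silently scored the wrong class.
print(scorer(clf, X, y))
```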
2 changes: 1 addition & 1 deletion sklearn/metrics/_classification.py
@@ -1251,7 +1251,7 @@ def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label):
str(average_options))

y_type, y_true, y_pred = _check_targets(y_true, y_pred)
present_labels = unique_labels(y_true, y_pred)
present_labels = unique_labels(y_true, y_pred).tolist()
if average == 'binary':
if y_type == 'binary':
if pos_label not in present_labels:
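The `.tolist()` change above is cosmetic: assuming the message is built with `%r` formatting (which the test expectations in this diff suggest), it makes the labels interpolate as a plain list rather than a numpy `array(...)` repr, matching the updated regex in `test_jaccard_score_validation` below. A quick sketch of the difference:

```python
from sklearn.utils.multiclass import unique_labels

present_labels = unique_labels([0, 1, 0], [0, 1, 1])
print("pos_label=%r is not a valid label: %r" % (2, present_labels))
# -> pos_label=2 is not a valid label: array([0, 1])
print("pos_label=%r is not a valid label: %r" % (2, present_labels.tolist()))
# -> pos_label=2 is not a valid label: [0, 1]
```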
69 changes: 53 additions & 16 deletions sklearn/metrics/_scorer.py
@@ -127,6 +127,48 @@ def __init__(self, score_func, sign, kwargs):
self._score_func = score_func
self._sign = sign

@staticmethod
def _check_pos_label(pos_label, classes):
if pos_label not in list(classes):
raise ValueError(
f"pos_label={pos_label} is not a valid label: {classes}"
)

def _check_decision_function(self, y_pred, classes):
"""Reverse the decision function depending of pos_label."""
pos_label = self._kwargs.get("pos_label", classes[1])
self._check_pos_label(pos_label, classes)
if pos_label == classes[0]:
# The implicit positive class of the binary classifier
# does not match `pos_label`: we need to invert the
# predictions
y_pred *= -1

return y_pred

def _select_proba(self, y_pred, classes, support_multi_class):
"""Select the column of y_pred when probabilities are provided."""
if y_pred.shape[1] == 2:
pos_label = self._kwargs.get("pos_label", classes[1])
self._check_pos_label(pos_label, classes)
col_idx = np.flatnonzero(classes == pos_label)[0]
y_pred = y_pred[:, col_idx]
else:
err_msg = (
f"Got predict_proba of shape {y_pred.shape}, but need "
f"classifier with two classes for {self._score_func.__name__} "
f"scoring"
)
if support_multi_class and y_pred.shape[1] == 1:
# In _ProbaScorer, y_true can be tagged as binary while the
# y_pred is multi_class. This case is supported when label is
# provided.
raise ValueError(err_msg)
elif not support_multi_class:
raise ValueError(err_msg)

return y_pred

def __repr__(self):
kwargs_string = "".join([", %s=%s" % (str(k), str(v))
for k, v in self._kwargs.items()])
@@ -238,13 +280,9 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
y_type = type_of_target(y)
y_pred = method_caller(clf, "predict_proba", X)
if y_type == "binary":
if y_pred.shape[1] == 2:
y_pred = y_pred[:, 1]
elif y_pred.shape[1] == 1: # not multiclass
raise ValueError('got predict_proba of shape {},'
' but need classifier with two'
' classes for {} scoring'.format(
y_pred.shape, self._score_func.__name__))
y_pred = self._select_proba(
y_pred, clf.classes_, support_multi_class=True
)
if sample_weight is not None:
return self._sign * self._score_func(y, y_pred,
sample_weight=sample_weight,
@@ -298,22 +336,21 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
try:
y_pred = method_caller(clf, "decision_function", X)

# For multi-output multi-class estimator
if isinstance(y_pred, list):
# For multi-output multi-class estimator
y_pred = np.vstack([p for p in y_pred]).T
elif y_type == "binary":
y_pred = self._check_decision_function(
y_pred, clf.classes_
)

except (NotImplementedError, AttributeError):
y_pred = method_caller(clf, "predict_proba", X)

if y_type == "binary":
if y_pred.shape[1] == 2:
y_pred = y_pred[:, 1]
else:
raise ValueError('got predict_proba of shape {},'
' but need classifier with two'
' classes for {} scoring'.format(
y_pred.shape,
self._score_func.__name__))
y_pred = self._select_proba(
y_pred, clf.classes_, support_multi_class=False,
)
elif isinstance(y_pred, list):
y_pred = np.vstack([p[:, -1] for p in y_pred]).T

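For reference, a standalone sketch (editorial, with invented values) of the column selection `_select_proba` performs when `pos_label` is not the implicit positive class `classes_[1]`:

```python
import numpy as np

classes = np.array(["cancer", "not cancer"])   # clf.classes_ (sorted)
proba = np.array([[0.7, 0.3], [0.2, 0.8]])     # output of predict_proba
pos_label = "cancer"                           # requested positive class

# pos_label is classes_[0] here, so the default column 1 would be wrong.
col_idx = np.flatnonzero(classes == pos_label)[0]  # -> 0
print(proba[:, col_idx])                           # [0.7 0.2]
```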
12 changes: 7 additions & 5 deletions sklearn/metrics/tests/test_classification.py
@@ -4,7 +4,6 @@
from itertools import chain
from itertools import permutations
import warnings
import re

import numpy as np
from scipy import linalg
@@ -1247,7 +1246,7 @@ def test_multilabel_hamming_loss():
def test_jaccard_score_validation():
y_true = np.array([0, 1, 0, 1, 1])
y_pred = np.array([0, 1, 0, 1, 1])
err_msg = r"pos_label=2 is not a valid label: array\(\[0, 1\]\)"
err_msg = r"pos_label=2 is not a valid label: \[0, 1\]"
with pytest.raises(ValueError, match=err_msg):
jaccard_score(y_true, y_pred, average='binary', pos_label=2)

@@ -2262,9 +2261,12 @@ def test_brier_score_loss():
# ensure to raise an error for multiclass y_true
y_true = np.array([0, 1, 2, 0])
y_pred = np.array([0.8, 0.6, 0.4, 0.2])
error_message = ("Only binary classification is supported. Labels "
"in y_true: {}".format(np.array([0, 1, 2])))
with pytest.raises(ValueError, match=re.escape(error_message)):
error_message = (
r"Only binary classification is supported. Labels in y_true: "
r"\[0 1 2\]"
)

with pytest.raises(ValueError, match=error_message):
brier_score_loss(y_true, y_pred)

# calculate correctly when there's only one class in y_true
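The switch above from `re.escape` to a hand-escaped raw string works because `pytest.raises(match=...)` applies `re.search` with the given pattern, so the literal brackets must be escaped in the pattern itself. A quick sketch:

```python
import re

msg = "Only binary classification is supported. Labels in y_true: [0 1 2]"
# pytest.raises(match=...) performs re.search with the pattern, so the
# literal brackets need escaping (previously done via re.escape).
assert re.search(r"Labels in y_true: \[0 1 2\]", msg)
```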