From db390c21b493feb1baa71b5948ddf3097608cb3c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 15 Jun 2020 10:11:32 +0200
Subject: [PATCH 01/17] ENH add a parameter pos_label in roc_auc_score

---
 sklearn/metrics/_ranking.py           | 16 +++++-----
 sklearn/metrics/tests/test_ranking.py | 43 ++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index 6aab05a71707d..db8d7a8080efd 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -217,14 +217,16 @@ def _binary_uninterpolated_average_precision(
                                  average, sample_weight=sample_weight)
 
 
-def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None):
+def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None,
+                          pos_label=None):
     """Binary roc auc score"""
     if len(np.unique(y_true)) != 2:
         raise ValueError("Only one class present in y_true. ROC AUC score "
                          "is not defined in that case.")
 
-    fpr, tpr, _ = roc_curve(y_true, y_score,
-                            sample_weight=sample_weight)
+    fpr, tpr, _ = roc_curve(
+        y_true, y_score, sample_weight=sample_weight, pos_label=pos_label,
+    )
     if max_fpr is None or max_fpr == 1:
         return auc(fpr, tpr)
     if max_fpr <= 0 or max_fpr > 1:
@@ -247,7 +249,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None):
 
 @_deprecate_positional_args
 def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
-                  max_fpr=None, multi_class="raise", labels=None):
+                  max_fpr=None, multi_class="raise", labels=None,
+                  pos_label=None):
     """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
     from prediction scores.
 
@@ -385,10 +388,9 @@ def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
         return _multiclass_roc_auc_score(y_true, y_score, labels,
                                          multi_class, average, sample_weight)
     elif y_type == "binary":
-        labels = np.unique(y_true)
-        y_true = label_binarize(y_true, classes=labels)[:, 0]
         return _average_binary_score(partial(_binary_roc_auc_score,
-                                             max_fpr=max_fpr),
+                                             max_fpr=max_fpr,
+                                             pos_label=pos_label),
                                      y_true, y_score, average,
                                      sample_weight=sample_weight)
     else:  # multilabel-indicator
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index a66ff9525c28c..e1c8053a59842 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -7,9 +7,13 @@
 from sklearn import datasets
 from sklearn import svm
 
-from sklearn.utils.extmath import softmax
 from sklearn.datasets import make_multilabel_classification
+from sklearn.datasets import load_breast_cancer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
 from sklearn.random_projection import _sparse_random_matrix
+from sklearn.utils import shuffle
+from sklearn.utils.extmath import softmax
 from sklearn.utils.validation import check_array, check_consistent_length
 from sklearn.utils.validation import check_random_state
 
@@ -1469,3 +1473,40 @@ def test_partial_roc_auc_score():
     assert_almost_equal(
         roc_auc_score(y_true, y_pred, max_fpr=max_fpr),
         _partial_roc_auc_score(y_true, y_pred, max_fpr))
+
+
+@pytest.mark.parametrize(
+    "decision_method", ["predict_proba", "decision_function"]
+)
+def test_roc_auc_score_pos_label(decision_method):
+    X, y = load_breast_cancer(return_X_y=True)
+    # create a highly imbalanced version of the dataset
+    idx_positive = np.flatnonzero(y == 1)
+    idx_negative = np.flatnonzero(y == 0)
+    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
+    X, y = X[idx_selected], y[idx_selected]
+    X, y = shuffle(X, y, random_state=42)
+    # only use 2 features to make the problem even harder
+    X = X[:, :2]
+    y = np.array(
+        ["cancer" if c == 1 else "not cancer" for c in y], dtype=object
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, stratify=y, random_state=0,
+    )
+
+    classifier = LogisticRegression()
+    classifier.fit(X_train, y_train)
+
+    # sanity check to be sure the positive class is classes_[0] and that we
+    # would be betrayed by the class imbalance if we relied on defaults
+    assert classifier.classes_.tolist() == ["cancer", "not cancer"]
+    pos_label = "cancer"
+
+    y_pred = getattr(classifier, decision_method)(X_test)
+    y_pred = y_pred[:, 0] if y_pred.ndim == 2 else -y_pred
+
+    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=pos_label)
+    roc_auc = roc_auc_score(y_test, y_pred, pos_label=pos_label)
+
+    assert roc_auc == pytest.approx(np.trapz(tpr, fpr))
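
A minimal sketch of what this first patch enables, assuming it is applied
(the labels and scores below are illustrative, not taken from the patch):

    import numpy as np
    from sklearn.metrics import roc_auc_score

    # String labels are neither {0, 1} nor {-1, 1}, so the positive class
    # has to be named explicitly through the new `pos_label` parameter.
    y_true = np.array(["cancer", "not cancer", "cancer", "not cancer"],
                      dtype=object)
    # Higher score means "more likely cancer".
    y_score = np.array([0.9, 0.2, 0.8, 0.4])

    # `pos_label` is forwarded to `roc_curve` inside `_binary_roc_auc_score`;
    # the scores separate the classes perfectly, so this prints 1.0.
    print(roc_auc_score(y_true, y_score, pos_label="cancer"))
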
From 0e2937ba73bdc03615810cab62d0bcbd99c40fbf Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 15 Jun 2020 10:18:56 +0200
Subject: [PATCH 02/17] add documentation

---
 doc/whats_new/v0.24.rst     | 5 ++++-
 sklearn/metrics/_ranking.py | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index ea27d7579ae4d..fbf2717ffbbf6 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -99,9 +99,12 @@ Changelog
   :pr:`17309` by :user:`Swier Heeres`
 
 - |Enhancement| Add `sample_weight` parameter to
-  :class:`metrics.median_absolute_error`. :pr:`17225` by
+  :func:`metrics.median_absolute_error`. :pr:`17225` by
   :user:`Lucy Liu`.
 
+- |Enhancement| Add `pos_label` parameter to :func:`roc_auc_score`.
+  :pr:`17594` by :user:`Guillaume Lemaitre`.
+
 :mod:`sklearn.model_selection`
 ..............................
 
diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index db8d7a8080efd..cbb073db3c1cc 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -329,6 +329,11 @@ def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
         If ``None``, the numerical or lexicographical order of the labels in
         ``y_true`` is used.
 
+    pos_label : int or str, default=None
+        The label of the positive class in the binary case. When
+        `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1}, `pos_label` is
+        set to 1; otherwise an error is raised.
+
     Returns
     -------
     auc : float
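
The documented default can be exercised directly. A sketch assuming the two
patches above are applied (the exact error message comes from `roc_curve`
and may differ):

    import numpy as np
    from sklearn.metrics import roc_auc_score

    y_score = np.array([0.1, 0.8, 0.3, 0.7])

    # {0, 1} labels: `pos_label=None` is resolved to 1, as documented.
    roc_auc_score(np.array([0, 1, 0, 1]), y_score)

    # Any other label set is ambiguous and raises without `pos_label`.
    y_str = np.array(["neg", "pos", "neg", "pos"], dtype=object)
    try:
        roc_auc_score(y_str, y_score)
    except ValueError as exc:
        print(exc)

    # Naming the positive class removes the ambiguity.
    roc_auc_score(y_str, y_score, pos_label="pos")
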
From 7f4fa4562d6af1fd3d4039810f8aa5073017011b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 15 Jun 2020 11:18:29 +0200
Subject: [PATCH 03/17] TST pass pos_label with str in common test

---
 sklearn/metrics/tests/test_common.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py
index 7301d21a35f39..e1cfdd0620a36 100644
--- a/sklearn/metrics/tests/test_common.py
+++ b/sklearn/metrics/tests/test_common.py
@@ -319,6 +319,17 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs):
 
 # Metrics with a "pos_label" argument
 METRICS_WITH_POS_LABEL = {
     "roc_curve",
+
+    "roc_auc_score",
+    "weighted_roc_auc",
+    "samples_roc_auc",
+    "micro_roc_auc",
+    "ovr_roc_auc",
+    "weighted_ovr_roc_auc",
+    "ovo_roc_auc",
+    "weighted_ovo_roc_auc",
+    "partial_roc_auc",
+
     "precision_recall_curve",
     "brier_score_loss",

From 1852d50b0f9614743ef4d43234f2fbb40dc35a0a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 15 Jun 2020 10:11:32 +0200
Subject: [PATCH 04/17] ENH add a parameter pos_label in roc_auc_score

---
 sklearn/metrics/_ranking.py           | 16 +++++-----
 sklearn/metrics/tests/test_ranking.py | 43 ++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index e07f61a92d478..5737f257912dc 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -218,14 +218,16 @@ def _binary_uninterpolated_average_precision(
                                  average, sample_weight=sample_weight)
 
 
-def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None):
+def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None,
+                          pos_label=None):
     """Binary roc auc score"""
     if len(np.unique(y_true)) != 2:
         raise ValueError("Only one class present in y_true. ROC AUC score "
                          "is not defined in that case.")
 
-    fpr, tpr, _ = roc_curve(y_true, y_score,
-                            sample_weight=sample_weight)
+    fpr, tpr, _ = roc_curve(
+        y_true, y_score, sample_weight=sample_weight, pos_label=pos_label,
+    )
     if max_fpr is None or max_fpr == 1:
         return auc(fpr, tpr)
     if max_fpr <= 0 or max_fpr > 1:
@@ -248,7 +250,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None):
 
 @_deprecate_positional_args
 def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
-                  max_fpr=None, multi_class="raise", labels=None):
+                  max_fpr=None, multi_class="raise", labels=None,
+                  pos_label=None):
     """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
     from prediction scores.
 
@@ -388,10 +391,9 @@ def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
         return _multiclass_roc_auc_score(y_true, y_score, labels,
                                          multi_class, average, sample_weight)
     elif y_type == "binary":
-        labels = np.unique(y_true)
-        y_true = label_binarize(y_true, classes=labels)[:, 0]
         return _average_binary_score(partial(_binary_roc_auc_score,
-                                             max_fpr=max_fpr),
+                                             max_fpr=max_fpr,
+                                             pos_label=pos_label),
                                      y_true, y_score, average,
                                      sample_weight=sample_weight)
     else:  # multilabel-indicator
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index 3daafa8d196d3..dd771570e8481 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -7,9 +7,13 @@
 from sklearn import datasets
 from sklearn import svm
 
-from sklearn.utils.extmath import softmax
 from sklearn.datasets import make_multilabel_classification
+from sklearn.datasets import load_breast_cancer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
 from sklearn.random_projection import _sparse_random_matrix
+from sklearn.utils import shuffle
+from sklearn.utils.extmath import softmax
 from sklearn.utils.validation import check_array, check_consistent_length
 from sklearn.utils.validation import check_random_state
 
@@ -1469,3 +1473,40 @@ def test_partial_roc_auc_score():
     assert_almost_equal(
         roc_auc_score(y_true, y_pred, max_fpr=max_fpr),
         _partial_roc_auc_score(y_true, y_pred, max_fpr))
+
+
+@pytest.mark.parametrize(
+    "decision_method", ["predict_proba", "decision_function"]
+)
+def test_roc_auc_score_pos_label(decision_method):
+    X, y = load_breast_cancer(return_X_y=True)
+    # create a highly imbalanced version of the dataset
+    idx_positive = np.flatnonzero(y == 1)
+    idx_negative = np.flatnonzero(y == 0)
+    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
+    X, y = X[idx_selected], y[idx_selected]
+    X, y = shuffle(X, y, random_state=42)
+    # only use 2 features to make the problem even harder
+    X = X[:, :2]
+    y = np.array(
+        ["cancer" if c == 1 else "not cancer" for c in y], dtype=object
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, stratify=y, random_state=0,
+    )
+
+    classifier = LogisticRegression()
+    classifier.fit(X_train, y_train)
+
+    # sanity check to be sure the positive class is classes_[0] and that we
+    # would be betrayed by the class imbalance if we relied on defaults
+    assert classifier.classes_.tolist() == ["cancer", "not cancer"]
+    pos_label = "cancer"
+
+    y_pred = getattr(classifier, decision_method)(X_test)
+    y_pred = y_pred[:, 0] if y_pred.ndim == 2 else -y_pred
+
+    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=pos_label)
+    roc_auc = roc_auc_score(y_test, y_pred, pos_label=pos_label)
+
+    assert roc_auc == pytest.approx(np.trapz(tpr, fpr))
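
Registering the ROC-AUC variants in `METRICS_WITH_POS_LABEL` opts them into
the shared `pos_label` checks. A hypothetical illustration (not the actual
common test) of the invariance those checks target: scoring string labels
with an explicit `pos_label` must match scoring the `{0, 1}` encoding of the
same target:

    import numpy as np
    from sklearn.metrics import roc_auc_score

    rng = np.random.RandomState(0)
    y = rng.randint(0, 2, size=50)
    scores = rng.rand(50)
    y_str = np.where(y == 1, "pos", "neg").astype(object)

    assert roc_auc_score(y_str, scores, pos_label="pos") == \
        roc_auc_score(y, scores)
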
From 62efb2d216c6272f0b9fc4f8ca3b8365f6ee2e4c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 15 Jun 2020 10:18:56 +0200
Subject: [PATCH 05/17] add documentation

---
 doc/whats_new/v0.24.rst     | 3 +++
 sklearn/metrics/_ranking.py | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index da366c913f500..42a9382e2d7b7 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -240,6 +240,9 @@ Changelog
   ``metric='seuclidean'`` and ``X`` is not type ``np.float64``.
   :pr:`15730` by :user:`Forrest Koch`.
 
+- |Enhancement| Add `pos_label` parameter to :func:`roc_auc_score`.
+  :pr:`17594` by :user:`Guillaume Lemaitre`.
+
 :mod:`sklearn.model_selection`
 ..............................
 
diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index 5737f257912dc..4682fc4a6854c 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -330,6 +330,11 @@ def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
         If ``None``, the numerical or lexicographical order of the labels in
         ``y_true`` is used.
 
+    pos_label : int or str, default=None
+        The label of the positive class in the binary case. When
+        `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1}, `pos_label` is
+        set to 1; otherwise an error is raised.
+
     Returns
     -------
     auc : float

From 2746252f92873994a92a79059de125581d3b8727 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 15 Jun 2020 17:20:20 +0200
Subject: [PATCH 06/17] add versionadded

---
 sklearn/metrics/_ranking.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index 4682fc4a6854c..5f738f01268d2 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -335,6 +335,8 @@ def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
         `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1}, `pos_label` is
         set to 1; otherwise an error is raised.
 
+        .. versionadded:: 0.24
+
     Returns
     -------
     auc : float

From fca877cc6e2665e6182f4ec6a2af7d5f56cb8ba3 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 7 Jul 2020 08:43:45 +0200
Subject: [PATCH 07/17] add test with grid-search

---
 sklearn/metrics/tests/test_ranking.py | 28 +++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index dd771570e8481..fba93036e3bd3 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -11,6 +11,7 @@
 from sklearn.datasets import load_breast_cancer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
+from sklearn.model_selection import GridSearchCV
 from sklearn.random_projection import _sparse_random_matrix
 from sklearn.utils import shuffle
 from sklearn.utils.extmath import softmax
@@ -1510,3 +1511,30 @@ def test_roc_auc_score_pos_label(decision_method):
     roc_auc = roc_auc_score(y_test, y_pred, pos_label=pos_label)
 
     assert roc_auc == pytest.approx(np.trapz(tpr, fpr))
+
+
+@pytest.mark.parametrize(
+    "decision_method", ["predict_proba", "decision_function"]
+)
+def test_roc_auc_score_pos_label_grid_search(decision_method):
+    X, y = load_breast_cancer(return_X_y=True)
+    # create a highly imbalanced version of the dataset
+    idx_positive = np.flatnonzero(y == 1)
+    idx_negative = np.flatnonzero(y == 0)
+    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
+    X, y = X[idx_selected], y[idx_selected]
+    X, y = shuffle(X, y, random_state=42)
+    # only use 2 features to make the problem even harder
+    X = X[:, :2]
+    y = np.array(
+        ["cancer" if c == 1 else "not cancer" for c in y], dtype=object
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, stratify=y, random_state=0,
+    )
+
+    param_grid = {"C": [0.1, 1]}
+    classifier = GridSearchCV(
+        LogisticRegression(), param_grid=param_grid, scoring="roc_auc",
+    )
+    classifier.fit(X_train, y_train)
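
The grid-search test above is the motivating failure: with string labels,
the built-in `roc_auc` scorer receives no `pos_label` and implicitly treats
`classes_[1]` as the positive class. A self-contained sketch of that default
on synthetic data (the names are illustrative):

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X = rng.randn(100, 2)
    y = np.where(X[:, 0] + 0.5 * rng.randn(100) > 0,
                 "not cancer", "cancer").astype(object)

    clf = LogisticRegression().fit(X, y)
    # classes_ is sorted lexicographically, so "cancer" is classes_[0];
    # the scorer's default column, predict_proba(X)[:, 1], is therefore the
    # probability of "not cancer" -- not of the class the test treats as
    # positive.
    print(clf.classes_)  # ['cancer' 'not cancer']
    proba_default = clf.predict_proba(X)[:, 1]
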
From 28cb7c80aaeb4f58322dc63960ea5cca308ba454 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 17 Jul 2020 11:14:17 +0200
Subject: [PATCH 08/17] iter

---
 sklearn/metrics/_scorer.py                  | 23 ++++++-
 sklearn/metrics/tests/test_score_objects.py | 72 ++++++++++++++++++++-
 2 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index b824b9b0cbcb8..8dc8fb4b8293a 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -239,7 +239,13 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
         y_pred = method_caller(clf, "predict_proba", X)
         if y_type == "binary":
             if y_pred.shape[1] == 2:
-                y_pred = y_pred[:, 1]
+                if "pos_label" in self._kwargs:
+                    col_idx = np.flatnonzero(
+                        clf.classes_ == self._kwargs["pos_label"]
+                    )[0]
+                else:
+                    col_idx = 1
+                y_pred = y_pred[:, col_idx]
             elif y_pred.shape[1] == 1:  # not multiclass
                 raise ValueError('got predict_proba of shape {},'
                                  ' but need classifier with two'
@@ -296,6 +302,13 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
             y_pred = method_caller(clf, "predict", X)
         else:
             try:
+                if (
+                    y_type == "binary"
+                    and self._score_func.__name__ == "roc_auc_score"
+                    and "pos_label" not in self._kwargs
+                ):
+                    self._kwargs["pos_label"] = clf.classes_[1]
+
                 y_pred = method_caller(clf, "decision_function", X)
 
                 # For multi-output multi-class estimator
@@ -307,7 +320,13 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
 
                 if y_type == "binary":
                     if y_pred.shape[1] == 2:
-                        y_pred = y_pred[:, 1]
+                        if "pos_label" in self._kwargs:
+                            col_idx = np.flatnonzero(
+                                clf.classes_ == self._kwargs["pos_label"]
+                            )[0]
+                        else:
+                            col_idx = 1
+                        y_pred = y_pred[:, col_idx]
                     else:
                         raise ValueError('got predict_proba of shape {},'
                                          ' but need classifier with two'
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 67900b7cb77c3..0f2c840f8467c 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -3,6 +3,7 @@
 import shutil
 import os
 import numbers
+from copy import deepcopy
 from unittest.mock import Mock
 from functools import partial
 
@@ -11,6 +12,7 @@
 import joblib
 from numpy.testing import assert_allclose
 
+from sklearn.utils import shuffle
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import ignore_warnings
@@ -32,10 +34,12 @@
 from sklearn.cluster import KMeans
 from sklearn.linear_model import Ridge, LogisticRegression, Perceptron
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+from sklearn.datasets import load_diabetes
+from sklearn.datasets import load_breast_cancer
 from sklearn.datasets import make_blobs
-from sklearn.datasets import make_classification, make_regression
+from sklearn.datasets import make_classification
 from sklearn.datasets import make_multilabel_classification
-from sklearn.datasets import load_diabetes
+from sklearn.datasets import make_regression
 from sklearn.model_selection import train_test_split, cross_val_score
 from sklearn.model_selection import GridSearchCV
 from sklearn.multiclass import OneVsRestClassifier
@@ -747,3 +751,67 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name):
     msg = "'Perceptron' object has no attribute 'predict_proba'"
     with pytest.raises(AttributeError, match=msg):
         scorer(lr, X, y)
+
+
+@pytest.mark.parametrize(
+    "scoring, is_symmetric",
+    [
+        ("roc_auc", True),
+        ("jaccard", False),
+        ("f1", False),
+        ("average_precision", False),
+        ("precision", False),
+        ("recall", False),
+        ("neg_brier_score", True),
+    ],
+)
+def test_scorer_pos_label_grid_search(scoring, is_symmetric):
+    # check the behaviour of scorers that require a `pos_label` with a
+    # binary target
+    X, y = load_breast_cancer(return_X_y=True)
+    # create a highly imbalanced version of the dataset
+    idx_positive = np.flatnonzero(y == 1)
+    idx_negative = np.flatnonzero(y == 0)
+    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
+    X, y = X[idx_selected], y[idx_selected]
+    X, y = shuffle(X, y, random_state=42)
+    # only use 2 features to make the problem even harder
+    X = X[:, :2]
+    y = np.array(
+        ["cancer" if c == 1 else "not cancer" for c in y], dtype=object
+    )
+
+    param_grid = {"max_depth": [1, 2, 3, 4, 5]}
+    classifier = GridSearchCV(
+        DecisionTreeClassifier(), param_grid=param_grid, scoring=scoring,
+    )
+
+    if is_symmetric:
+        # we will expand to compute for several scorers with different pos_label
+        # which should all give the same results
+        scorer = get_scorer(scoring)
+        scorer_pos_label, scorer_neg_label = deepcopy(scorer), deepcopy(scorer)
+
+        scorer_pos_label._kwargs["pos_label"] = "cancer"
+        scorer_neg_label._kwargs["pos_label"] = "not cancer"
+        multi_scoring = {
+            "scorer_str": scorer,
+            "scorer_pos": scorer_pos_label,
+            "scorer_neg": scorer_neg_label,
+        }
+
+        classifier.set_params(
+            scoring=multi_scoring, refit="scorer_str",
+        )
+        classifier.fit(X, y)
+        assert_allclose(
+            classifier.cv_results_["mean_test_scorer_str"],
+            classifier.cv_results_["mean_test_scorer_pos"]
+        )
+        assert_allclose(
+            classifier.cv_results_["mean_test_scorer_str"],
+            classifier.cv_results_["mean_test_scorer_neg"]
+        )
+    else:
+        with pytest.raises(ValueError):
+            classifier.fit(X, y)
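
The heart of the scorer change is the column lookup. The same logic in
isolation, with a hypothetical `classes_` array:

    import numpy as np

    classes = np.array(["cancer", "not cancer"], dtype=object)
    pos_label = "cancer"

    # Index of the predict_proba / decision_function column matching the
    # requested positive class, instead of the hard-coded column 1.
    col_idx = np.flatnonzero(classes == pos_label)[0]
    print(col_idx)  # 0
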
From 419c3005e5102e1f779cfac2749bad46b5b7135f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 17 Jul 2020 11:47:21 +0200
Subject: [PATCH 09/17] iter

---
 sklearn/metrics/tests/test_ranking.py | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index fba93036e3bd3..211bb5e5f4bdf 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -1511,30 +1511,3 @@ def test_roc_auc_score_pos_label(decision_method):
     roc_auc = roc_auc_score(y_test, y_pred, pos_label=pos_label)
 
     assert roc_auc == pytest.approx(np.trapz(tpr, fpr))
-
-
-@pytest.mark.parametrize(
-    "decision_method", ["predict_proba", "decision_function"]
-)
-def test_roc_auc_score_pos_label_grid_search(decision_method):
-    X, y = load_breast_cancer(return_X_y=True)
-    # create a highly imbalanced version of the dataset
-    idx_positive = np.flatnonzero(y == 1)
-    idx_negative = np.flatnonzero(y == 0)
-    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
-    X, y = X[idx_selected], y[idx_selected]
-    X, y = shuffle(X, y, random_state=42)
-    # only use 2 features to make the problem even harder
-    X = X[:, :2]
-    y = np.array(
-        ["cancer" if c == 1 else "not cancer" for c in y], dtype=object
-    )
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, stratify=y, random_state=0,
-    )
-
-    param_grid = {"C": [0.1, 1]}
-    classifier = GridSearchCV(
-        LogisticRegression(), param_grid=param_grid, scoring="roc_auc",
-    )
-    classifier.fit(X_train, y_train)

From 67c7a3ebb510a18d15772e224964eeb1cee78504 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 17 Jul 2020 11:49:06 +0200
Subject: [PATCH 10/17] PEP8

---
 sklearn/metrics/tests/test_ranking.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index 211bb5e5f4bdf..dd771570e8481 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -11,7 +11,6 @@
 from sklearn.datasets import load_breast_cancer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
-from sklearn.model_selection import GridSearchCV
 from sklearn.random_projection import _sparse_random_matrix
 from sklearn.utils import shuffle
 from sklearn.utils.extmath import softmax

From 8abad1c9884f1f14b3acf9e7eb20f3bddb320a05 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 17 Jul 2020 11:51:59 +0200
Subject: [PATCH 11/17] add link to issue

---
 sklearn/metrics/tests/test_score_objects.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 0f2c840f8467c..70e189e9033ba 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -766,8 +766,9 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name):
     ],
 )
 def test_scorer_pos_label_grid_search(scoring, is_symmetric):
-    # check the behaviour of scorers that require a `pos_label` with a
-    # binary target
+    # Check the behaviour of scorers that require a `pos_label` with a
+    # binary target. Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/pull/17572
     X, y = load_breast_cancer(return_X_y=True)
     # create a highly imbalanced version of the dataset
     idx_positive = np.flatnonzero(y == 1)
     idx_negative = np.flatnonzero(y == 0)

From 113034f19e1a9667d8465d0c8f0c1b6e863838ce Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 17 Jul 2020 12:10:51 +0200
Subject: [PATCH 12/17] TST make sure that pos_label is computing the right thing

---
 sklearn/metrics/tests/test_score_objects.py | 31 +++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 70e189e9033ba..6f5d46122ff43 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -782,9 +782,12 @@ def test_scorer_pos_label_grid_search(scoring, is_symmetric):
         ["cancer" if c == 1 else "not cancer" for c in y], dtype=object
     )
 
-    param_grid = {"max_depth": [1, 2, 3, 4, 5]}
+    param_grid = {"max_depth": [1, 3, 5]}
     classifier = GridSearchCV(
-        DecisionTreeClassifier(), param_grid=param_grid, scoring=scoring,
+        DecisionTreeClassifier(random_state=0),
+        param_grid=param_grid,
+        scoring=scoring,
+        cv=2,
     )
 
     if is_symmetric:
@@ -815,4 +818,28 @@ def test_scorer_pos_label_grid_search(scoring, is_symmetric):
         )
     else:
         with pytest.raises(ValueError):
+            # it should raise an error by default
             classifier.fit(X, y)
+
+        # passing pos_label should solve the issue and should be equivalent to
+        # encoding the label with 0, 1
+        # we should control our cv indices since y will be different, leading
+        # to a different cv split
+        indices = np.arange(y.shape[0])
+        cv = [
+            (indices[: indices.size // 2], indices[indices.size // 2 :]),
+            (indices[indices.size // 2 :], indices[: indices.size // 2]),
+        ]
+        classifier.set_params(cv=cv)
+
+        y_encoded = y == "cancer"
+        classifier.fit(X, y_encoded)
+        mean_test_score_y_encoded = classifier.cv_results_["mean_test_score"]
+
+        scorer = get_scorer(scoring)
+        scorer._kwargs["pos_label"] = "cancer"
+        classifier.set_params(scoring=scorer)
+        classifier.fit(X, y)
+        mean_test_score_pos_label = classifier.cv_results_["mean_test_score"]
+
+        assert_allclose(mean_test_score_pos_label, mean_test_score_y_encoded)
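
Because the new assertions compare runs fitted on differently encoded
targets, the test pins the CV folds by hand instead of relying on the
splitter. The same construction in isolation (the sample count is
illustrative):

    import numpy as np

    # Two mirrored folds defined purely by position, so the split is
    # identical whether y holds strings or the {0, 1} encoding.
    indices = np.arange(50)
    half = indices.size // 2
    cv = [(indices[:half], indices[half:]),
          (indices[half:], indices[:half])]
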
From ebbbd84a13680ee001ad612e8b90a3c0a44ddda2 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 17 Jul 2020 12:34:24 +0200
Subject: [PATCH 13/17] iter

---
 sklearn/metrics/tests/test_score_objects.py | 45 +++++++++++----------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 6f5d46122ff43..57dbee0eb08ce 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -820,26 +820,27 @@ def test_scorer_pos_label_grid_search(scoring, is_symmetric):
         with pytest.raises(ValueError):
             # it should raise an error by default
             classifier.fit(X, y)
 
-        # passing pos_label should solve the issue and should be equivalent to
-        # encoding the label with 0, 1
-        # we should control our cv indices since y will be different, leading
-        # to a different cv split
-        indices = np.arange(y.shape[0])
-        cv = [
-            (indices[: indices.size // 2], indices[indices.size // 2 :]),
-            (indices[indices.size // 2 :], indices[: indices.size // 2]),
-        ]
-        classifier.set_params(cv=cv)
-
-        y_encoded = y == "cancer"
-        classifier.fit(X, y_encoded)
-        mean_test_score_y_encoded = classifier.cv_results_["mean_test_score"]
-
-        scorer = get_scorer(scoring)
-        scorer._kwargs["pos_label"] = "cancer"
-        classifier.set_params(scoring=scorer)
-        classifier.fit(X, y)
-        mean_test_score_pos_label = classifier.cv_results_["mean_test_score"]
-
-        assert_allclose(mean_test_score_pos_label, mean_test_score_y_encoded)
+    # passing pos_label should always solve the issue and should be equivalent
+    # to encoding the label with {0, 1}.
+
+    # we should control our cv indices since y will be different, leading
+    # to a different cv split
+    indices = np.arange(y.shape[0])
+    cv = [
+        (indices[: indices.size // 2], indices[indices.size // 2 :]),
+        (indices[indices.size // 2 :], indices[: indices.size // 2]),
+    ]
+    classifier.set_params(cv=cv, scoring=scoring, refit=True)
+
+    y_encoded = (y == "cancer").astype(int)
+    classifier.fit(X, y_encoded)
+    mean_test_score_y_encoded = classifier.cv_results_["mean_test_score"]
+
+    scorer = get_scorer(scoring)
+    scorer._kwargs["pos_label"] = "cancer"
+    classifier.set_params(scoring=scorer)
+    classifier.fit(X, y)
+    mean_test_score_pos_label = classifier.cv_results_["mean_test_score"]
+
+    assert_allclose(mean_test_score_pos_label, mean_test_score_y_encoded)

From 89eff6680ac97c418c9a61ec5d47cdcc3ad5bde1 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 17 Jul 2020 12:58:49 +0200
Subject: [PATCH 14/17] PEP8

---
 sklearn/metrics/tests/test_score_objects.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 57dbee0eb08ce..f3a63a46363ed 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -793,7 +793,7 @@ def test_scorer_pos_label_grid_search(scoring, is_symmetric):
     if is_symmetric:
         # we will expand to compute for several scorers with different pos_label
        # which should all give the same results
-        scorer = get_scorer(scoring)
+        scorer = deepcopy(get_scorer(scoring))
         scorer_pos_label, scorer_neg_label = deepcopy(scorer), deepcopy(scorer)
 
         scorer_pos_label._kwargs["pos_label"] = "cancer"
@@ -828,8 +828,8 @@ def test_scorer_pos_label_grid_search(scoring, is_symmetric):
     # to a different cv split
     indices = np.arange(y.shape[0])
     cv = [
-        (indices[: indices.size // 2], indices[indices.size // 2 :]),
-        (indices[indices.size // 2 :], indices[: indices.size // 2]),
+        (indices[:indices.size // 2], indices[indices.size // 2:]),
+        (indices[indices.size // 2:], indices[:indices.size // 2]),
     ]
     classifier.set_params(cv=cv, scoring=scoring, refit=True)
 
@@ -837,7 +837,7 @@ def test_scorer_pos_label_grid_search(scoring, is_symmetric):
     classifier.fit(X, y_encoded)
     mean_test_score_y_encoded = classifier.cv_results_["mean_test_score"]
 
-    scorer = get_scorer(scoring)
+    scorer = deepcopy(get_scorer(scoring))
     scorer._kwargs["pos_label"] = "cancer"
     classifier.set_params(scoring=scorer)
     classifier.fit(X, y)

From 4ebeb8796b5cc181912a54cc64f38110b328dbf7 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 17 Jul 2020 13:54:49 +0200
Subject: [PATCH 15/17] need to make a deepcopy

---
 sklearn/metrics/_scorer.py                  | 5 +++--
 sklearn/metrics/tests/test_score_objects.py | 7 +++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index 8dc8fb4b8293a..d07eb31596e2b 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -18,9 +18,10 @@
 # Arnaud Joly
 # License: Simplified BSD
 
+from collections import Counter
 from collections.abc import Iterable
+from copy import deepcopy
 from functools import partial
-from collections import Counter
 
 import numpy as np
 
@@ -364,7 +365,7 @@ def get_scorer(scoring):
     """
     if isinstance(scoring, str):
         try:
-            scorer = SCORERS[scoring]
+            scorer = deepcopy(SCORERS[scoring])
         except KeyError:
             raise ValueError('%r is not a valid scoring value. '
                              'Use sorted(sklearn.metrics.SCORERS.keys()) '
                              'to get valid options.' % scoring)
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index f3a63a46363ed..48b01e638ddc0 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -793,13 +793,12 @@ def test_scorer_pos_label_grid_search(scoring, is_symmetric):
     if is_symmetric:
         # we will expand to compute for several scorers with different pos_label
         # which should all give the same results
-        scorer = deepcopy(get_scorer(scoring))
+        scorer = get_scorer(scoring)
         scorer_pos_label, scorer_neg_label = deepcopy(scorer), deepcopy(scorer)
-
         scorer_pos_label._kwargs["pos_label"] = "cancer"
         scorer_neg_label._kwargs["pos_label"] = "not cancer"
         multi_scoring = {
-            "scorer_str": scorer,
+            "scorer_str": scoring,
             "scorer_pos": scorer_pos_label,
             "scorer_neg": scorer_neg_label,
         }
@@ -837,7 +836,7 @@ def test_scorer_pos_label_grid_search(scoring, is_symmetric):
     classifier.fit(X, y_encoded)
     mean_test_score_y_encoded = classifier.cv_results_["mean_test_score"]
 
-    scorer = deepcopy(get_scorer(scoring))
+    scorer = get_scorer(scoring)
     scorer._kwargs["pos_label"] = "cancer"
     classifier.set_params(scoring=scorer)
     classifier.fit(X, y)
"binary": if y_pred.shape[1] == 2: + if ( + self._score_func.__name__ == "roc_auc_score" + and "pos_label" not in self._kwargs + ): + self._kwargs["pos_label"] = clf.classes_[1] + if "pos_label" in self._kwargs: col_idx = np.flatnonzero( clf.classes_ == self._kwargs["pos_label"] @@ -371,7 +370,7 @@ def get_scorer(scoring): 'Use sorted(sklearn.metrics.SCORERS.keys()) ' 'to get valid options.' % scoring) else: - scorer = scoring + scorer = deepcopy(scoring) return scorer From 33dfd93e674f554a610ed52e704910213fb2f91b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 17 Jul 2020 14:23:50 +0200 Subject: [PATCH 17/17] iter --- sklearn/metrics/_scorer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index e91522f10b329..b40ad6c23db69 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -364,14 +364,14 @@ def get_scorer(scoring): """ if isinstance(scoring, str): try: - scorer = deepcopy(SCORERS[scoring]) + scorer = SCORERS[scoring] except KeyError: raise ValueError('%r is not a valid scoring value. ' 'Use sorted(sklearn.metrics.SCORERS.keys()) ' 'to get valid options.' % scoring) else: - scorer = deepcopy(scoring) - return scorer + scorer = scoring + return deepcopy(scorer) def _passthrough_scorer(estimator, *args, **kwargs):