From 13f38a15e72e0414a3ecc305b52cb58eb83ca127 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Sun, 9 Jun 2024 01:40:31 -0300 Subject: [PATCH 1/2] moving _curve_scorer class to metrics --- sklearn/metrics/_classification_threshold.py | 122 ++++++++++++++++++ .../tests/test_classification_threshold.py | 100 ++++++++++++++ .../_classification_threshold.py | 122 +----------------- .../tests/test_classification_threshold.py | 94 +------------- 4 files changed, 227 insertions(+), 211 deletions(-) create mode 100644 sklearn/metrics/_classification_threshold.py create mode 100644 sklearn/metrics/tests/test_classification_threshold.py diff --git a/sklearn/metrics/_classification_threshold.py b/sklearn/metrics/_classification_threshold.py new file mode 100644 index 0000000000000..520f75eb5269a --- /dev/null +++ b/sklearn/metrics/_classification_threshold.py @@ -0,0 +1,122 @@ +from numbers import Integral + +import numpy as np + +from ._scorer import _BaseScorer + + +def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): + """Threshold `y_score` and return the associated class labels.""" + if pos_label is None: + map_thresholded_score_to_label = np.array([0, 1]) + else: + pos_label_idx = np.flatnonzero(classes == pos_label)[0] + neg_label_idx = np.flatnonzero(classes != pos_label)[0] + map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) + + return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] + + +class _CurveScorer(_BaseScorer): + """Scorer taking a continuous response and output a score for each threshold. + + Parameters + ---------- + score_func : callable + The score function to use. It will be called as + `score_func(y_true, y_pred, **kwargs)`. + + sign : int + Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. + Thus, `sign` defined if higher scores are better or worse. + + kwargs : dict + Additional parameters to pass to the score function. + + thresholds : int or array-like + Related to the number of decision thresholds for which we want to compute the + score. If an integer, it will be used to generate `thresholds` thresholds + uniformly distributed between the minimum and maximum predicted scores. If an + array-like, it will be used as the thresholds. + + response_method : str + The method to call on the estimator to get the response values. + """ + + def __init__(self, score_func, sign, kwargs, thresholds, response_method): + super().__init__( + score_func=score_func, + sign=sign, + kwargs=kwargs, + response_method=response_method, + ) + self._thresholds = thresholds + + @classmethod + def from_scorer(cls, scorer, response_method, thresholds): + """Create a continuous scorer from a normal scorer.""" + instance = cls( + score_func=scorer._score_func, + sign=scorer._sign, + response_method=response_method, + thresholds=thresholds, + kwargs=scorer._kwargs, + ) + # transfer the metadata request + instance._metadata_request = scorer._get_metadata_request() + return instance + + def _score(self, method_caller, estimator, X, y_true, **kwargs): + """Evaluate predicted target values for X relative to y_true. + + Parameters + ---------- + method_caller : callable + Returns predictions given an estimator, method name, and other + arguments, potentially caching results. + + estimator : object + Trained estimator to use for scoring. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Test data that will be fed to estimator.predict. 
+ + y_true : array-like of shape (n_samples,) + Gold standard target values for X. + + **kwargs : dict + Other parameters passed to the scorer. Refer to + :func:`set_score_request` for more details. + + Returns + ------- + scores : ndarray of shape (thresholds,) + The scores associated to each threshold. + + potential_thresholds : ndarray of shape (thresholds,) + The potential thresholds used to compute the scores. + """ + pos_label = self._get_pos_label() + y_score = method_caller( + estimator, self._response_method, X, pos_label=pos_label + ) + + scoring_kwargs = {**self._kwargs, **kwargs} + if isinstance(self._thresholds, Integral): + potential_thresholds = np.linspace( + np.min(y_score), np.max(y_score), self._thresholds + ) + else: + potential_thresholds = np.asarray(self._thresholds) + score_thresholds = [ + self._sign + * self._score_func( + y_true, + _threshold_scores_to_class_labels( + y_score, th, estimator.classes_, pos_label + ), + **scoring_kwargs, + ) + for th in potential_thresholds + ] + return np.array(score_thresholds), potential_thresholds diff --git a/sklearn/metrics/tests/test_classification_threshold.py b/sklearn/metrics/tests/test_classification_threshold.py new file mode 100644 index 0000000000000..e25f61ec803fe --- /dev/null +++ b/sklearn/metrics/tests/test_classification_threshold.py @@ -0,0 +1,100 @@ +import pytest + +from sklearn.datasets import make_classification +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + balanced_accuracy_score, + recall_score, +) +from sklearn.metrics._classification_threshold import _CurveScorer + + +def test_curve_scorer(): + """Check the behaviour of the `_CurveScorer` class.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression().fit(X, y) + curve_scorer = _CurveScorer( + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores, thresholds = curve_scorer(estimator, X, y) + + assert thresholds.shape == scores.shape + # check that the thresholds are probabilities with extreme values close to 0 and 1. + # they are not exactly 0 and 1 because they are the extremum of the + # `estimator.predict_proba(X)` values. 
+ assert 0 <= thresholds.min() <= 0.01 + assert 0.99 <= thresholds.max() <= 1 + # balanced accuracy should be between 0.5 and 1 when it is not adjusted + assert 0.5 <= scores.min() <= 1 + + # check that passing kwargs to the scorer works + curve_scorer = _CurveScorer( + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={"adjusted": True}, + ) + scores, thresholds = curve_scorer(estimator, X, y) + + # balanced accuracy should be between 0.5 and 1 when it is not adjusted + assert 0 <= scores.min() <= 0.5 + + # check that we can inverse the sign of the score when dealing with `neg_*` scorer + curve_scorer = _CurveScorer( + balanced_accuracy_score, + sign=-1, + response_method="predict_proba", + thresholds=10, + kwargs={"adjusted": True}, + ) + scores, thresholds = curve_scorer(estimator, X, y) + + assert all(scores <= 0) + + +def test_curve_scorer_pos_label(global_random_seed): + """Check that we propagate properly the `pos_label` parameter to the scorer.""" + n_samples = 30 + X, y = make_classification( + n_samples=n_samples, weights=[0.9, 0.1], random_state=global_random_seed + ) + estimator = LogisticRegression().fit(X, y) + + curve_scorer = _CurveScorer( + recall_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={"pos_label": 1}, + ) + scores_pos_label_1, thresholds_pos_label_1 = curve_scorer(estimator, X, y) + + curve_scorer = _CurveScorer( + recall_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={"pos_label": 0}, + ) + scores_pos_label_0, thresholds_pos_label_0 = curve_scorer(estimator, X, y) + + # Since `pos_label` is forwarded to the curve_scorer, the thresholds are not equal. + assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() + # The min-max range for the thresholds is defined by the probabilities of the + # `pos_label` class (the column of `predict_proba`). + y_pred = estimator.predict_proba(X) + assert thresholds_pos_label_0.min() == pytest.approx(y_pred.min(axis=0)[0]) + assert thresholds_pos_label_0.max() == pytest.approx(y_pred.max(axis=0)[0]) + assert thresholds_pos_label_1.min() == pytest.approx(y_pred.min(axis=0)[1]) + assert thresholds_pos_label_1.max() == pytest.approx(y_pred.max(axis=0)[1]) + + # The recall cannot be negative and `pos_label=1` should have a higher recall + # since there is less samples to be considered. 
+ assert 0.0 < scores_pos_label_0.min() < scores_pos_label_1.min() + assert scores_pos_label_0.max() == pytest.approx(1.0) + assert scores_pos_label_1.max() == pytest.approx(1.0) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 1d221d3388434..5d49aeb7a5839 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -15,7 +15,10 @@ check_scoring, get_scorer_names, ) -from ..metrics._scorer import _BaseScorer +from ..metrics._classification_threshold import ( + _CurveScorer, + _threshold_scores_to_class_labels, +) from ..utils import _safe_indexing from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions from ..utils._response import _get_response_values_binary @@ -54,18 +57,6 @@ def check(self): return check -def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): - """Threshold `y_score` and return the associated class labels.""" - if pos_label is None: - map_thresholded_score_to_label = np.array([0, 1]) - else: - pos_label_idx = np.flatnonzero(classes == pos_label)[0] - neg_label_idx = np.flatnonzero(classes != pos_label)[0] - map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) - - return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] - - class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Base class for binary classifiers that set a non-default decision threshold. @@ -426,111 +417,6 @@ def get_metadata_routing(self): return router -class _CurveScorer(_BaseScorer): - """Scorer taking a continuous response and output a score for each threshold. - - Parameters - ---------- - score_func : callable - The score function to use. It will be called as - `score_func(y_true, y_pred, **kwargs)`. - - sign : int - Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. - Thus, `sign` defined if higher scores are better or worse. - - kwargs : dict - Additional parameters to pass to the score function. - - thresholds : int or array-like - Related to the number of decision thresholds for which we want to compute the - score. If an integer, it will be used to generate `thresholds` thresholds - uniformly distributed between the minimum and maximum predicted scores. If an - array-like, it will be used as the thresholds. - - response_method : str - The method to call on the estimator to get the response values. - """ - - def __init__(self, score_func, sign, kwargs, thresholds, response_method): - super().__init__( - score_func=score_func, - sign=sign, - kwargs=kwargs, - response_method=response_method, - ) - self._thresholds = thresholds - - @classmethod - def from_scorer(cls, scorer, response_method, thresholds): - """Create a continuous scorer from a normal scorer.""" - instance = cls( - score_func=scorer._score_func, - sign=scorer._sign, - response_method=response_method, - thresholds=thresholds, - kwargs=scorer._kwargs, - ) - # transfer the metadata request - instance._metadata_request = scorer._get_metadata_request() - return instance - - def _score(self, method_caller, estimator, X, y_true, **kwargs): - """Evaluate predicted target values for X relative to y_true. - - Parameters - ---------- - method_caller : callable - Returns predictions given an estimator, method name, and other - arguments, potentially caching results. - - estimator : object - Trained estimator to use for scoring. 
- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Test data that will be fed to estimator.predict. - - y_true : array-like of shape (n_samples,) - Gold standard target values for X. - - **kwargs : dict - Other parameters passed to the scorer. Refer to - :func:`set_score_request` for more details. - - Returns - ------- - scores : ndarray of shape (thresholds,) - The scores associated to each threshold. - - potential_thresholds : ndarray of shape (thresholds,) - The potential thresholds used to compute the scores. - """ - pos_label = self._get_pos_label() - y_score = method_caller( - estimator, self._response_method, X, pos_label=pos_label - ) - - scoring_kwargs = {**self._kwargs, **kwargs} - if isinstance(self._thresholds, Integral): - potential_thresholds = np.linspace( - np.min(y_score), np.max(y_score), self._thresholds - ) - else: - potential_thresholds = np.asarray(self._thresholds) - score_thresholds = [ - self._sign - * self._score_func( - y_true, - _threshold_scores_to_class_labels( - y_score, th, estimator.classes_, pos_label - ), - **scoring_kwargs, - ) - for th in potential_thresholds - ] - return np.array(score_thresholds), potential_thresholds - - def _fit_and_score_over_thresholds( classifier, X, diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 77c4c20e99ef2..2f6b49f695f05 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -17,15 +17,14 @@ f1_score, fbeta_score, make_scorer, - recall_score, ) +from sklearn.metrics._classification_threshold import _CurveScorer from sklearn.model_selection import ( FixedThresholdClassifier, StratifiedShuffleSplit, TunedThresholdClassifierCV, ) from sklearn.model_selection._classification_threshold import ( - _CurveScorer, _fit_and_score_over_thresholds, ) from sklearn.pipeline import make_pipeline @@ -40,97 +39,6 @@ ) -def test_curve_scorer(): - """Check the behaviour of the `_CurveScorer` class.""" - X, y = make_classification(random_state=0) - estimator = LogisticRegression().fit(X, y) - curve_scorer = _CurveScorer( - balanced_accuracy_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={}, - ) - scores, thresholds = curve_scorer(estimator, X, y) - - assert thresholds.shape == scores.shape - # check that the thresholds are probabilities with extreme values close to 0 and 1. - # they are not exactly 0 and 1 because they are the extremum of the - # `estimator.predict_proba(X)` values. 
- assert 0 <= thresholds.min() <= 0.01 - assert 0.99 <= thresholds.max() <= 1 - # balanced accuracy should be between 0.5 and 1 when it is not adjusted - assert 0.5 <= scores.min() <= 1 - - # check that passing kwargs to the scorer works - curve_scorer = _CurveScorer( - balanced_accuracy_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={"adjusted": True}, - ) - scores, thresholds = curve_scorer(estimator, X, y) - - # balanced accuracy should be between 0.5 and 1 when it is not adjusted - assert 0 <= scores.min() <= 0.5 - - # check that we can inverse the sign of the score when dealing with `neg_*` scorer - curve_scorer = _CurveScorer( - balanced_accuracy_score, - sign=-1, - response_method="predict_proba", - thresholds=10, - kwargs={"adjusted": True}, - ) - scores, thresholds = curve_scorer(estimator, X, y) - - assert all(scores <= 0) - - -def test_curve_scorer_pos_label(global_random_seed): - """Check that we propagate properly the `pos_label` parameter to the scorer.""" - n_samples = 30 - X, y = make_classification( - n_samples=n_samples, weights=[0.9, 0.1], random_state=global_random_seed - ) - estimator = LogisticRegression().fit(X, y) - - curve_scorer = _CurveScorer( - recall_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={"pos_label": 1}, - ) - scores_pos_label_1, thresholds_pos_label_1 = curve_scorer(estimator, X, y) - - curve_scorer = _CurveScorer( - recall_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={"pos_label": 0}, - ) - scores_pos_label_0, thresholds_pos_label_0 = curve_scorer(estimator, X, y) - - # Since `pos_label` is forwarded to the curve_scorer, the thresholds are not equal. - assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() - # The min-max range for the thresholds is defined by the probabilities of the - # `pos_label` class (the column of `predict_proba`). - y_pred = estimator.predict_proba(X) - assert thresholds_pos_label_0.min() == pytest.approx(y_pred.min(axis=0)[0]) - assert thresholds_pos_label_0.max() == pytest.approx(y_pred.max(axis=0)[0]) - assert thresholds_pos_label_1.min() == pytest.approx(y_pred.min(axis=0)[1]) - assert thresholds_pos_label_1.max() == pytest.approx(y_pred.max(axis=0)[1]) - - # The recall cannot be negative and `pos_label=1` should have a higher recall - # since there is less samples to be considered. 
- assert 0.0 < scores_pos_label_0.min() < scores_pos_label_1.min() - assert scores_pos_label_0.max() == pytest.approx(1.0) - assert scores_pos_label_1.max() == pytest.approx(1.0) - - def test_fit_and_score_over_thresholds_curve_scorers(): """Check that `_fit_and_score_over_thresholds` returns thresholds in ascending order for the different accepted curve scorers.""" From 63204155e350607058f7ac3f3056b03e9e0bc810 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Tue, 2 Jul 2024 23:39:42 -0300 Subject: [PATCH 2/2] moving files from metrics._classification_threshold to metrics._scorer --- sklearn/metrics/_classification_threshold.py | 122 ------------------ sklearn/metrics/_scorer.py | 120 +++++++++++++++++ .../tests/test_classification_threshold.py | 100 -------------- sklearn/metrics/tests/test_score_objects.py | 92 +++++++++++++ .../_classification_threshold.py | 2 +- .../tests/test_classification_threshold.py | 2 +- 6 files changed, 214 insertions(+), 224 deletions(-) delete mode 100644 sklearn/metrics/_classification_threshold.py delete mode 100644 sklearn/metrics/tests/test_classification_threshold.py diff --git a/sklearn/metrics/_classification_threshold.py b/sklearn/metrics/_classification_threshold.py deleted file mode 100644 index 520f75eb5269a..0000000000000 --- a/sklearn/metrics/_classification_threshold.py +++ /dev/null @@ -1,122 +0,0 @@ -from numbers import Integral - -import numpy as np - -from ._scorer import _BaseScorer - - -def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): - """Threshold `y_score` and return the associated class labels.""" - if pos_label is None: - map_thresholded_score_to_label = np.array([0, 1]) - else: - pos_label_idx = np.flatnonzero(classes == pos_label)[0] - neg_label_idx = np.flatnonzero(classes != pos_label)[0] - map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) - - return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] - - -class _CurveScorer(_BaseScorer): - """Scorer taking a continuous response and output a score for each threshold. - - Parameters - ---------- - score_func : callable - The score function to use. It will be called as - `score_func(y_true, y_pred, **kwargs)`. - - sign : int - Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. - Thus, `sign` defined if higher scores are better or worse. - - kwargs : dict - Additional parameters to pass to the score function. - - thresholds : int or array-like - Related to the number of decision thresholds for which we want to compute the - score. If an integer, it will be used to generate `thresholds` thresholds - uniformly distributed between the minimum and maximum predicted scores. If an - array-like, it will be used as the thresholds. - - response_method : str - The method to call on the estimator to get the response values. 
- """ - - def __init__(self, score_func, sign, kwargs, thresholds, response_method): - super().__init__( - score_func=score_func, - sign=sign, - kwargs=kwargs, - response_method=response_method, - ) - self._thresholds = thresholds - - @classmethod - def from_scorer(cls, scorer, response_method, thresholds): - """Create a continuous scorer from a normal scorer.""" - instance = cls( - score_func=scorer._score_func, - sign=scorer._sign, - response_method=response_method, - thresholds=thresholds, - kwargs=scorer._kwargs, - ) - # transfer the metadata request - instance._metadata_request = scorer._get_metadata_request() - return instance - - def _score(self, method_caller, estimator, X, y_true, **kwargs): - """Evaluate predicted target values for X relative to y_true. - - Parameters - ---------- - method_caller : callable - Returns predictions given an estimator, method name, and other - arguments, potentially caching results. - - estimator : object - Trained estimator to use for scoring. - - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Test data that will be fed to estimator.predict. - - y_true : array-like of shape (n_samples,) - Gold standard target values for X. - - **kwargs : dict - Other parameters passed to the scorer. Refer to - :func:`set_score_request` for more details. - - Returns - ------- - scores : ndarray of shape (thresholds,) - The scores associated to each threshold. - - potential_thresholds : ndarray of shape (thresholds,) - The potential thresholds used to compute the scores. - """ - pos_label = self._get_pos_label() - y_score = method_caller( - estimator, self._response_method, X, pos_label=pos_label - ) - - scoring_kwargs = {**self._kwargs, **kwargs} - if isinstance(self._thresholds, Integral): - potential_thresholds = np.linspace( - np.min(y_score), np.max(y_score), self._thresholds - ) - else: - potential_thresholds = np.asarray(self._thresholds) - score_thresholds = [ - self._sign - * self._score_func( - y_true, - _threshold_scores_to_class_labels( - y_score, th, estimator.classes_, pos_label - ), - **scoring_kwargs, - ) - for th in potential_thresholds - ] - return np.array(score_thresholds), potential_thresholds diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index c1a916aa0b5f3..b2f4f7178940b 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -23,8 +23,11 @@ from collections import Counter from functools import partial from inspect import signature +from numbers import Integral from traceback import format_exc +import numpy as np + from ..base import is_regressor from ..utils import Bunch from ..utils._param_validation import HasMethods, Hidden, StrOptions, validate_params @@ -1064,3 +1067,120 @@ def check_scoring(estimator=None, scoring=None, *, allow_none=False, raise_exc=T "If no scoring is specified, the estimator passed should " "have a 'score' method. The estimator %r does not." 
            % estimator
        )
+
+
+def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label):
+    """Threshold `y_score` and return the associated class labels."""
+    if pos_label is None:
+        map_thresholded_score_to_label = np.array([0, 1])
+    else:
+        pos_label_idx = np.flatnonzero(classes == pos_label)[0]
+        neg_label_idx = np.flatnonzero(classes != pos_label)[0]
+        map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx])
+
+    return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]]
+
+
+class _CurveScorer(_BaseScorer):
+    """Scorer taking a continuous response and outputting a score per threshold.
+
+    Parameters
+    ----------
+    score_func : callable
+        The score function to use. It will be called as
+        `score_func(y_true, y_pred, **kwargs)`.
+
+    sign : int
+        Either 1 or -1 to return the score as `sign * score_func(estimator, X, y)`.
+        Thus, `sign` defines whether higher scores are better or worse.
+
+    kwargs : dict
+        Additional parameters to pass to the score function.
+
+    thresholds : int or array-like
+        Related to the number of decision thresholds for which we want to compute the
+        score. If an integer, it will be used to generate `thresholds` thresholds
+        uniformly distributed between the minimum and maximum predicted scores. If an
+        array-like, it will be used as the thresholds.
+
+    response_method : str
+        The method to call on the estimator to get the response values.
+    """
+
+    def __init__(self, score_func, sign, kwargs, thresholds, response_method):
+        super().__init__(
+            score_func=score_func,
+            sign=sign,
+            kwargs=kwargs,
+            response_method=response_method,
+        )
+        self._thresholds = thresholds
+
+    @classmethod
+    def from_scorer(cls, scorer, response_method, thresholds):
+        """Create a continuous scorer from a normal scorer."""
+        instance = cls(
+            score_func=scorer._score_func,
+            sign=scorer._sign,
+            response_method=response_method,
+            thresholds=thresholds,
+            kwargs=scorer._kwargs,
+        )
+        # transfer the metadata request
+        instance._metadata_request = scorer._get_metadata_request()
+        return instance
+
+    def _score(self, method_caller, estimator, X, y_true, **kwargs):
+        """Evaluate predicted target values for X relative to y_true.
+
+        Parameters
+        ----------
+        method_caller : callable
+            Returns predictions given an estimator, method name, and other
+            arguments, potentially caching results.
+
+        estimator : object
+            Trained estimator to use for scoring.
+
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Test data that will be fed to the `response_method` of `estimator`.
+
+        y_true : array-like of shape (n_samples,)
+            Gold standard target values for X.
+
+        **kwargs : dict
+            Other parameters passed to the scorer. Refer to
+            :func:`set_score_request` for more details.
+
+        Returns
+        -------
+        scores : ndarray of shape (thresholds,)
+            The scores associated with each threshold.
+
+        potential_thresholds : ndarray of shape (thresholds,)
+            The potential thresholds used to compute the scores.
+ """ + pos_label = self._get_pos_label() + y_score = method_caller( + estimator, self._response_method, X, pos_label=pos_label + ) + + scoring_kwargs = {**self._kwargs, **kwargs} + if isinstance(self._thresholds, Integral): + potential_thresholds = np.linspace( + np.min(y_score), np.max(y_score), self._thresholds + ) + else: + potential_thresholds = np.asarray(self._thresholds) + score_thresholds = [ + self._sign + * self._score_func( + y_true, + _threshold_scores_to_class_labels( + y_score, th, estimator.classes_, pos_label + ), + **scoring_kwargs, + ) + for th in potential_thresholds + ] + return np.array(score_thresholds), potential_thresholds diff --git a/sklearn/metrics/tests/test_classification_threshold.py b/sklearn/metrics/tests/test_classification_threshold.py deleted file mode 100644 index e25f61ec803fe..0000000000000 --- a/sklearn/metrics/tests/test_classification_threshold.py +++ /dev/null @@ -1,100 +0,0 @@ -import pytest - -from sklearn.datasets import make_classification -from sklearn.linear_model import LogisticRegression -from sklearn.metrics import ( - balanced_accuracy_score, - recall_score, -) -from sklearn.metrics._classification_threshold import _CurveScorer - - -def test_curve_scorer(): - """Check the behaviour of the `_CurveScorer` class.""" - X, y = make_classification(random_state=0) - estimator = LogisticRegression().fit(X, y) - curve_scorer = _CurveScorer( - balanced_accuracy_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={}, - ) - scores, thresholds = curve_scorer(estimator, X, y) - - assert thresholds.shape == scores.shape - # check that the thresholds are probabilities with extreme values close to 0 and 1. - # they are not exactly 0 and 1 because they are the extremum of the - # `estimator.predict_proba(X)` values. - assert 0 <= thresholds.min() <= 0.01 - assert 0.99 <= thresholds.max() <= 1 - # balanced accuracy should be between 0.5 and 1 when it is not adjusted - assert 0.5 <= scores.min() <= 1 - - # check that passing kwargs to the scorer works - curve_scorer = _CurveScorer( - balanced_accuracy_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={"adjusted": True}, - ) - scores, thresholds = curve_scorer(estimator, X, y) - - # balanced accuracy should be between 0.5 and 1 when it is not adjusted - assert 0 <= scores.min() <= 0.5 - - # check that we can inverse the sign of the score when dealing with `neg_*` scorer - curve_scorer = _CurveScorer( - balanced_accuracy_score, - sign=-1, - response_method="predict_proba", - thresholds=10, - kwargs={"adjusted": True}, - ) - scores, thresholds = curve_scorer(estimator, X, y) - - assert all(scores <= 0) - - -def test_curve_scorer_pos_label(global_random_seed): - """Check that we propagate properly the `pos_label` parameter to the scorer.""" - n_samples = 30 - X, y = make_classification( - n_samples=n_samples, weights=[0.9, 0.1], random_state=global_random_seed - ) - estimator = LogisticRegression().fit(X, y) - - curve_scorer = _CurveScorer( - recall_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={"pos_label": 1}, - ) - scores_pos_label_1, thresholds_pos_label_1 = curve_scorer(estimator, X, y) - - curve_scorer = _CurveScorer( - recall_score, - sign=1, - response_method="predict_proba", - thresholds=10, - kwargs={"pos_label": 0}, - ) - scores_pos_label_0, thresholds_pos_label_0 = curve_scorer(estimator, X, y) - - # Since `pos_label` is forwarded to the curve_scorer, the thresholds are not equal. 
- assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() - # The min-max range for the thresholds is defined by the probabilities of the - # `pos_label` class (the column of `predict_proba`). - y_pred = estimator.predict_proba(X) - assert thresholds_pos_label_0.min() == pytest.approx(y_pred.min(axis=0)[0]) - assert thresholds_pos_label_0.max() == pytest.approx(y_pred.max(axis=0)[0]) - assert thresholds_pos_label_1.min() == pytest.approx(y_pred.min(axis=0)[1]) - assert thresholds_pos_label_1.max() == pytest.approx(y_pred.max(axis=0)[1]) - - # The recall cannot be negative and `pos_label=1` should have a higher recall - # since there is less samples to be considered. - assert 0.0 < scores_pos_label_0.min() < scores_pos_label_1.min() - assert scores_pos_label_0.max() == pytest.approx(1.0) - assert scores_pos_label_1.max() == pytest.approx(1.0) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index bfe8f57f92265..4fc295a10d9fa 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -43,6 +43,7 @@ from sklearn.metrics import cluster as cluster_module from sklearn.metrics._scorer import ( _check_multimetric_scoring, + _CurveScorer, _MultimetricScorer, _PassthroughScorer, _Scorer, @@ -1598,3 +1599,94 @@ def test_metadata_routing_multimetric_metadata_routing(enable_metadata_routing): multimetric_scorer = _MultimetricScorer(scorers={"acc": get_scorer("accuracy")}) with config_context(enable_metadata_routing=enable_metadata_routing): multimetric_scorer(estimator, X, y) + + +def test_curve_scorer(): + """Check the behaviour of the `_CurveScorer` class.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression().fit(X, y) + curve_scorer = _CurveScorer( + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores, thresholds = curve_scorer(estimator, X, y) + + assert thresholds.shape == scores.shape + # check that the thresholds are probabilities with extreme values close to 0 and 1. + # they are not exactly 0 and 1 because they are the extremum of the + # `estimator.predict_proba(X)` values. 
+    assert 0 <= thresholds.min() <= 0.01
+    assert 0.99 <= thresholds.max() <= 1
+    # balanced accuracy should be between 0.5 and 1 when it is not adjusted
+    assert 0.5 <= scores.min() <= 1
+
+    # check that passing kwargs to the scorer works
+    curve_scorer = _CurveScorer(
+        balanced_accuracy_score,
+        sign=1,
+        response_method="predict_proba",
+        thresholds=10,
+        kwargs={"adjusted": True},
+    )
+    scores, thresholds = curve_scorer(estimator, X, y)
+
+    # balanced accuracy should be between 0 and 0.5 when it is adjusted
+    assert 0 <= scores.min() <= 0.5
+
+    # check that we can invert the sign of the score when dealing with `neg_*` scorer
+    curve_scorer = _CurveScorer(
+        balanced_accuracy_score,
+        sign=-1,
+        response_method="predict_proba",
+        thresholds=10,
+        kwargs={"adjusted": True},
+    )
+    scores, thresholds = curve_scorer(estimator, X, y)
+
+    assert all(scores <= 0)
+
+
+def test_curve_scorer_pos_label(global_random_seed):
+    """Check that the `pos_label` parameter is properly propagated to the scorer."""
+    n_samples = 30
+    X, y = make_classification(
+        n_samples=n_samples, weights=[0.9, 0.1], random_state=global_random_seed
+    )
+    estimator = LogisticRegression().fit(X, y)
+
+    curve_scorer = _CurveScorer(
+        recall_score,
+        sign=1,
+        response_method="predict_proba",
+        thresholds=10,
+        kwargs={"pos_label": 1},
+    )
+    scores_pos_label_1, thresholds_pos_label_1 = curve_scorer(estimator, X, y)
+
+    curve_scorer = _CurveScorer(
+        recall_score,
+        sign=1,
+        response_method="predict_proba",
+        thresholds=10,
+        kwargs={"pos_label": 0},
+    )
+    scores_pos_label_0, thresholds_pos_label_0 = curve_scorer(estimator, X, y)
+
+    # Since `pos_label` is forwarded to the curve_scorer, the thresholds are not equal.
+    assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all()
+    # The min-max range for the thresholds is defined by the probabilities of the
+    # `pos_label` class (the column of `predict_proba`).
+    y_pred = estimator.predict_proba(X)
+    assert thresholds_pos_label_0.min() == pytest.approx(y_pred.min(axis=0)[0])
+    assert thresholds_pos_label_0.max() == pytest.approx(y_pred.max(axis=0)[0])
+    assert thresholds_pos_label_1.min() == pytest.approx(y_pred.min(axis=0)[1])
+    assert thresholds_pos_label_1.max() == pytest.approx(y_pred.max(axis=0)[1])
+
+    # The recall cannot be negative and `pos_label=1` should have a higher recall
+    # since there are fewer samples to consider.
+ assert 0.0 < scores_pos_label_0.min() < scores_pos_label_1.min() + assert scores_pos_label_0.max() == pytest.approx(1.0) + assert scores_pos_label_1.max() == pytest.approx(1.0) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 5d49aeb7a5839..c2f58e7337676 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -15,7 +15,7 @@ check_scoring, get_scorer_names, ) -from ..metrics._classification_threshold import ( +from ..metrics._scorer import ( _CurveScorer, _threshold_scores_to_class_labels, ) diff --git a/sklearn/model_selection/tests/test_classification_threshold.py b/sklearn/model_selection/tests/test_classification_threshold.py index 2f6b49f695f05..a7bc6f86f1248 100644 --- a/sklearn/model_selection/tests/test_classification_threshold.py +++ b/sklearn/model_selection/tests/test_classification_threshold.py @@ -18,7 +18,7 @@ fbeta_score, make_scorer, ) -from sklearn.metrics._classification_threshold import _CurveScorer +from sklearn.metrics._scorer import _CurveScorer from sklearn.model_selection import ( FixedThresholdClassifier, StratifiedShuffleSplit,
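
Usage sketch (illustrative only, not part of the patches): how the relocated
`_CurveScorer` is exercised, mirroring the moved tests. It assumes the state of the
tree after the second commit, i.e. the private import path
`sklearn.metrics._scorer._CurveScorer`; as a private helper it is not part of the
public API and may change.

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import balanced_accuracy_score
    from sklearn.metrics._scorer import _CurveScorer

    X, y = make_classification(random_state=0)
    estimator = LogisticRegression().fit(X, y)

    # Score balanced accuracy at 10 thresholds spread uniformly between the
    # minimum and maximum of the positive-class `predict_proba` values.
    curve_scorer = _CurveScorer(
        balanced_accuracy_score,
        sign=1,
        response_method="predict_proba",
        thresholds=10,
        kwargs={},
    )
    scores, thresholds = curve_scorer(estimator, X, y)
    # `scores[i]` is the balanced accuracy obtained when the positive class is
    # predicted for samples whose probability is >= `thresholds[i]`.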