From f128648f79dcbabb25e3709fd876918b22275b5a Mon Sep 17 00:00:00 2001 From: virchan Date: Thu, 7 Dec 2023 18:02:24 -0800 Subject: [PATCH 01/17] Added link for plot_adaboost_multiclass example --- examples/ensemble/plot_adaboost_multiclass.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/ensemble/plot_adaboost_multiclass.py b/examples/ensemble/plot_adaboost_multiclass.py index 35b0d1bb86470..5343d93778696 100644 --- a/examples/ensemble/plot_adaboost_multiclass.py +++ b/examples/ensemble/plot_adaboost_multiclass.py @@ -17,6 +17,9 @@ be selected. This ensures that subsequent iterations of the algorithm focus on the difficult-to-classify samples. +For more examples of usage, see +:ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. + .. topic:: References: .. [1] :doi:`J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class adaboost." From 6fb31eadfba39a029d26bc042c69520b8fc58c63 Mon Sep 17 00:00:00 2001 From: virchan Date: Sun, 10 Dec 2023 10:07:14 -0800 Subject: [PATCH 02/17] Moved the example link from the example itself back to the doc string in _weight_boosting.py. --- examples/ensemble/plot_adaboost_multiclass.py | 3 --- sklearn/ensemble/_weight_boosting.py | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ensemble/plot_adaboost_multiclass.py b/examples/ensemble/plot_adaboost_multiclass.py index 5343d93778696..35b0d1bb86470 100644 --- a/examples/ensemble/plot_adaboost_multiclass.py +++ b/examples/ensemble/plot_adaboost_multiclass.py @@ -17,9 +17,6 @@ be selected. This ensures that subsequent iterations of the algorithm focus on the difficult-to-classify samples. -For more examples of usage, see -:ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. - .. topic:: References: .. [1] :doi:`J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class adaboost." diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index af731892880ee..313524097d088 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -351,6 +351,9 @@ class AdaBoostClassifier( Read more in the :ref:`User Guide `. + For more examples of usage, see + :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. + .. versionadded:: 0.14 Parameters From a2c74683323e3bf85c836f6503a4211f30b84fac Mon Sep 17 00:00:00 2001 From: virchan Date: Fri, 26 Jan 2024 11:54:03 -0800 Subject: [PATCH 03/17] Reworded the example reference of AdaBoost in the `ensemble/_weight_boosting.py` file, moving it below the `Examples` section for improved organization. Included an AdaBoost example reference within the DecisionTree class in the `tree/_classes.py` file. --- sklearn/ensemble/_weight_boosting.py | 6 +++--- sklearn/tree/_classes.py | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 313524097d088..406acfff4fa6f 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -351,9 +351,6 @@ class AdaBoostClassifier( Read more in the :ref:`User Guide `. - For more examples of usage, see - :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. - .. versionadded:: 0.14 Parameters @@ -481,6 +478,9 @@ class AdaBoostClassifier( array([1]) >>> clf.score(X, y) 0.96... + + For an example of using AdaBoost to fit a sequence of DecisionTrees as weak learners, + please refer to :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`.
""" # TODO(1.6): Modify _parameter_constraints for "algorithm" to only check diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 00d61f184731d..3972dbe909906 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -934,6 +934,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): ... array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., 0.93..., 0.93..., 1. , 0.93..., 1. ]) + + For an example of using AdaBoost to fit a sequence of DecisionTrees as weak learners, + please refer to :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. """ _parameter_constraints: dict = { From eb50b06819ba762d0b48135ecf3c0a31f8aea0d9 Mon Sep 17 00:00:00 2001 From: virchan Date: Tue, 27 Feb 2024 16:18:32 -0800 Subject: [PATCH 04/17] - Added the Multi-class AdaBoosted Decision Trees example to the Decision Trees user guide. - Modified the doc-string wording in the `AdaBoostClassifier` class referencing to the aforementioned example. --- doc/modules/tree.rst | 1 + sklearn/ensemble/_weight_boosting.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index b54b913573a34..e838216ac76c5 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -222,6 +222,7 @@ of external libraries and is more compact: * :ref:`sphx_glr_auto_examples_tree_plot_iris_dtc.py` * :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` + * :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py` .. _tree_regression: diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 406acfff4fa6f..7e1e9ee8fb906 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -479,7 +479,7 @@ class AdaBoostClassifier( >>> clf.score(X, y) 0.96... - For an example of using AdaBoost to fit a sequence of DecisionTrees as weak learners, + For a detailed example of using AdaBoost to fit a sequence of DecisionTrees as weak learners, please refer to :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. """ From 46318a7e443b5ba9810f51cfd4cfacb2b148707e Mon Sep 17 00:00:00 2001 From: virchan Date: Fri, 1 Mar 2024 13:39:28 -0800 Subject: [PATCH 05/17] Reformatted doc-strings to meet the ruff requirement --- sklearn/ensemble/_weight_boosting.py | 5 +++-- sklearn/tree/_classes.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 7e1e9ee8fb906..0461a397983be 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -479,8 +479,9 @@ class AdaBoostClassifier( >>> clf.score(X, y) 0.96... - For a detailed example of using AdaBoost to fit a sequence of DecisionTrees as weak learners, - please refer to :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. + For a detailed example of using AdaBoost to fit a sequence of DecisionTrees + as weaklearners, please refer to + :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. """ # TODO(1.6): Modify _parameter_constraints for "algorithm" to only check diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 0205fdb117c03..020779404ca2d 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -935,8 +935,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., 0.93..., 0.93..., 1. , 0.93..., 1. 
]) - For an example of using AdaBoost to fit a sequence of DecisionTrees as weak learners, - please refer to :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. + For an example of using AdaBoost to fit a sequence of + DecisionTrees as weak learners, please refer to + :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. """ _parameter_constraints: dict = { From 752a8e83bc25c25c48a8a57cd112b925e255ca1e Mon Sep 17 00:00:00 2001 From: virchan Date: Fri, 1 Mar 2024 15:36:34 -0800 Subject: [PATCH 06/17] Empty commit for test re-run. From 6d866d4808d3e152f1d5b009e6f12a571c264bc7 Mon Sep 17 00:00:00 2001 From: virchan Date: Wed, 6 Mar 2024 11:11:08 -0800 Subject: [PATCH 07/17] Removed example links from two files --- doc/modules/tree.rst | 1 - sklearn/tree/_classes.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index e838216ac76c5..b54b913573a34 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -222,7 +222,6 @@ of external libraries and is more compact: * :ref:`sphx_glr_auto_examples_tree_plot_iris_dtc.py` * :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` - * :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py` .. _tree_regression: diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 020779404ca2d..7e2419a77dcee 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -935,9 +935,6 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., 0.93..., 0.93..., 1. , 0.93..., 1. ]) - For an example of using AdaBoost to fit a sequence of - DecisionTrees as weak learners, please refer to - :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. """ _parameter_constraints: dict = { From 23e6ef1a7ad8ef92376b4857bc59831d3a7bd205 Mon Sep 17 00:00:00 2001 From: virchan Date: Wed, 6 Mar 2024 11:37:36 -0800 Subject: [PATCH 08/17] Empty commit for checks re-run From 03aa44ab56d11e7a7b84dcc2cadd677d1366f1aa Mon Sep 17 00:00:00 2001 From: virchan Date: Wed, 6 Mar 2024 12:05:47 -0800 Subject: [PATCH 09/17] Removed an empty line for checks re-run. --- sklearn/tree/_classes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 7e2419a77dcee..9f99d831a0990 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -934,7 +934,6 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): ... array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., 0.93..., 0.93..., 1. , 0.93..., 1. ]) - """ _parameter_constraints: dict = { From 88d45008d8c3daeee5fbdd159082fddee661e4d7 Mon Sep 17 00:00:00 2001 From: virchan Date: Mon, 6 May 2024 10:57:29 -0700 Subject: [PATCH 10/17] Created the `sklearn/calibration_temperature.py` to contain all work related to temperature scaling. 
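Temperature scaling recalibrates a multi-class classifier by learning a single scalar T > 0 and replacing softmax(z) with softmax(z / T), where z are the classifier's logits; dividing by a positive constant preserves the argmax, so only the confidence of the predictions changes, never the predicted class. T is chosen by minimizing the negative log-likelihood on held-out calibration data. A minimal self-contained sketch of the idea, with illustrative names and independent of the implementation in this file:

    import numpy as np
    from scipy.optimize import minimize_scalar

    def fit_temperature(logits, y):
        # Negative log-likelihood of softmax(logits / t); y holds integer class indices.
        def nll(t):
            z = logits / t
            z -= z.max(axis=1, keepdims=True)  # subtract row max for numerical stability
            log_p = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
            return -log_p[np.arange(len(y)), y].sum()
        # A bounded scalar search suffices, since there is a single parameter.
        return minimize_scalar(nll, bounds=(0.05, 20.0), method="bounded").x

A well-calibrated model yields T close to 1, while the overconfident models common in practice yield T > 1, which softens the predicted probabilities.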
--- sklearn/calibration_temperature.py | 1558 ++++++++++++++++++++++++++++ 1 file changed, 1558 insertions(+) create mode 100644 sklearn/calibration_temperature.py diff --git a/sklearn/calibration_temperature.py b/sklearn/calibration_temperature.py new file mode 100644 index 0000000000000..12f287482488d --- /dev/null +++ b/sklearn/calibration_temperature.py @@ -0,0 +1,1558 @@ +"""Calibration of predicted probabilities.""" + +# Author: Alexandre Gramfort +# Balazs Kegl +# Jan Hendrik Metzen +# Mathieu Blondel +# +# License: BSD 3 clause + +import warnings +from inspect import signature +from math import log +from numbers import Integral, Real + +import numpy as np +from scipy.optimize import minimize +from scipy.special import expit, softmax + +from sklearn.utils import Bunch + +from ._loss import HalfBinomialLoss +from .base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + RegressorMixin, + _fit_context, + clone, +) +from .isotonic import IsotonicRegression +from .model_selection import check_cv, cross_val_predict +from .preprocessing import LabelEncoder, label_binarize +from .svm import LinearSVC +from .utils import ( + _safe_indexing, + column_or_1d, + indexable, +) +from .utils._param_validation import ( + HasMethods, + Interval, + StrOptions, + validate_params, +) +from .utils._plotting import _BinaryClassifierCurveDisplayMixin +from .utils._response import _get_response_values, _process_predict_proba +from .utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _routing_enabled, + process_routing, +) +from .utils.multiclass import check_classification_targets +from .utils.parallel import Parallel, delayed +from .utils.validation import ( + _check_method_params, + _check_pos_label_consistency, + _check_response_method, + _check_sample_weight, + _num_samples, + check_consistent_length, + check_is_fitted, +) + + +class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Probability calibration with isotonic regression or logistic regression. + + This class uses cross-validation to both estimate the parameters of a + classifier and subsequently calibrate a classifier. With default + `ensemble=True`, for each cv split it + fits a copy of the base estimator to the training subset, and calibrates it + using the testing subset. For prediction, predicted probabilities are + averaged across these individual calibrated classifiers. When + `ensemble=False`, cross-validation is used to obtain unbiased predictions, + via :func:`~sklearn.model_selection.cross_val_predict`, which are then + used for calibration. For prediction, the base estimator, trained using all + the data, is used. This is the prediction method implemented when + `probability=True` for :class:`~sklearn.svm.SVC` and :class:`~sklearn.svm.NuSVC` + estimators (see :ref:`User Guide ` for details). + + Already fitted classifiers can be calibrated via the parameter + `cv="prefit"`. In this case, no cross-validation is used and all provided + data is used for calibration. The user has to take care manually that data + for model fitting and calibration are disjoint. + + The calibration is based on the :term:`decision_function` method of the + `estimator` if it exists, else on :term:`predict_proba`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator instance, default=None + The classifier whose output needs to be calibrated to provide more + accurate `predict_proba` outputs.
The default classifier is + a :class:`~sklearn.svm.LinearSVC`. + + .. versionadded:: 1.2 + + method : {'sigmoid', 'isotonic'}, default='sigmoid' + The method to use for calibration. Can be 'sigmoid' which + corresponds to Platt's method (i.e. a logistic regression model) or + 'isotonic' which is a non-parametric approach. It is not advised to + use isotonic calibration with too few calibration samples + ``(<<1000)`` since it tends to overfit. + + cv : int, cross-validation generator, iterable or "prefit", \ + default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if ``y`` is binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. If ``y`` is + neither binary nor multiclass, :class:`~sklearn.model_selection.KFold` + is used. + + Refer to the :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + If "prefit" is passed, it is assumed that `estimator` has been + fitted already and all data is used for calibration. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. + + Base estimator clones are fitted in parallel across cross-validation + iterations. Therefore parallelism happens only when `cv != "prefit"`. + + See :term:`Glossary ` for more details. + + .. versionadded:: 0.24 + + ensemble : bool, default=True + Determines how the calibrator is fitted when `cv` is not `'prefit'`. + Ignored if `cv='prefit'`. + + If `True`, the `estimator` is fitted using training data, and + calibrated using testing data, for each `cv` fold. The final estimator + is an ensemble of `n_cv` fitted classifier and calibrator pairs, where + `n_cv` is the number of cross-validation folds. The output is the + average predicted probabilities of all pairs. + + If `False`, `cv` is used to compute unbiased predictions, via + :func:`~sklearn.model_selection.cross_val_predict`, which are then + used for calibration. At prediction time, the classifier used is the + `estimator` trained on all the data. + Note that this method is also internally implemented in + :mod:`sklearn.svm` estimators with the `probability=True` parameter. + + .. versionadded:: 0.24 + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 1.0 + + calibrated_classifiers_ : list (len() equal to cv or 1 if `cv="prefit"` \ + or `ensemble=False`) + The list of classifier and calibrator pairs. + + - When `cv="prefit"`, the fitted `estimator` and fitted + calibrator. + - When `cv` is not "prefit" and `ensemble=True`, `n_cv` fitted + `estimator` and calibrator pairs. `n_cv` is the number of + cross-validation folds.
+ - When `cv` is not "prefit" and `ensemble=False`, the `estimator`, + fitted on all the data, and fitted calibrator. + + .. versionchanged:: 0.24 + Single calibrated classifier case when `ensemble=False`. + + See Also + -------- + calibration_curve : Compute true and predicted probabilities + for a calibration curve. + + References + ---------- + .. [1] Obtaining calibrated probability estimates from decision trees + and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001 + + .. [2] Transforming Classifier Scores into Accurate Multiclass + Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002) + + .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to + Regularized Likelihood Methods, J. Platt, (1999) + + .. [4] Predicting Good Probabilities with Supervised Learning, + A. Niculescu-Mizil & R. Caruana, ICML 2005 + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.naive_bayes import GaussianNB + >>> from sklearn.calibration import CalibratedClassifierCV + >>> X, y = make_classification(n_samples=100, n_features=2, + ... n_redundant=0, random_state=42) + >>> base_clf = GaussianNB() + >>> calibrated_clf = CalibratedClassifierCV(base_clf, cv=3) + >>> calibrated_clf.fit(X, y) + CalibratedClassifierCV(...) + >>> len(calibrated_clf.calibrated_classifiers_) + 3 + >>> calibrated_clf.predict_proba(X)[:5, :] + array([[0.110..., 0.889...], + [0.072..., 0.927...], + [0.928..., 0.071...], + [0.928..., 0.071...], + [0.071..., 0.928...]]) + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_classification(n_samples=100, n_features=2, + ... n_redundant=0, random_state=42) + >>> X_train, X_calib, y_train, y_calib = train_test_split( + ... X, y, random_state=42 + ... ) + >>> base_clf = GaussianNB() + >>> base_clf.fit(X_train, y_train) + GaussianNB() + >>> calibrated_clf = CalibratedClassifierCV(base_clf, cv="prefit") + >>> calibrated_clf.fit(X_calib, y_calib) + CalibratedClassifierCV(...) + >>> len(calibrated_clf.calibrated_classifiers_) + 1 + >>> calibrated_clf.predict_proba([[-0.5, 0.5]]) + array([[0.936..., 0.063...]]) + """ + + _parameter_constraints: dict = { + "estimator": [ + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + None, + ], + "method": [StrOptions({"isotonic", "sigmoid"})], + "cv": ["cv_object", StrOptions({"prefit"})], + "n_jobs": [Integral, None], + "ensemble": ["boolean"], + } + + def __init__( + self, + estimator=None, + *, + method="sigmoid", + cv=None, + n_jobs=None, + ensemble=True, + ): + self.estimator = estimator + self.method = method + self.cv = cv + self.n_jobs = n_jobs + self.ensemble = ensemble + + def _get_estimator(self): + """Resolve which estimator to return (default is LinearSVC)""" + if self.estimator is None: + # we want all classifiers that don't expose a random_state + # to be deterministic (and we don't want to expose this one). + estimator = LinearSVC(random_state=0, dual="auto") + if _routing_enabled(): + estimator.set_fit_request(sample_weight=True) + else: + estimator = self.estimator + + return estimator + + @_fit_context( + # CalibratedClassifierCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, sample_weight=None, **fit_params): + """Fit the calibrated model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. 
+ + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + + **fit_params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ + check_classification_targets(y) + X, y = indexable(X, y) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + estimator = self._get_estimator() + + self.calibrated_classifiers_ = [] + if self.cv == "prefit": + # `classes_` should be consistent with that of estimator + check_is_fitted(self.estimator, attributes=["classes_"]) + self.classes_ = self.estimator.classes_ + + predictions, _ = _get_response_values( + estimator, + X, + response_method=["decision_function", "predict_proba"], + ) + if predictions.ndim == 1: + # Reshape binary output from `(n_samples,)` to `(n_samples, 1)` + predictions = predictions.reshape(-1, 1) + + calibrated_classifier = _fit_calibrator( + estimator, + predictions, + y, + self.classes_, + self.method, + sample_weight, + ) + self.calibrated_classifiers_.append(calibrated_classifier) + else: + # Set `classes_` using all `y` + label_encoder_ = LabelEncoder().fit(y) + self.classes_ = label_encoder_.classes_ + + if _routing_enabled(): + routed_params = process_routing( + self, + "fit", + sample_weight=sample_weight, + **fit_params, + ) + else: + # sample_weight checks + fit_parameters = signature(estimator.fit).parameters + supports_sw = "sample_weight" in fit_parameters + if sample_weight is not None and not supports_sw: + estimator_name = type(estimator).__name__ + warnings.warn( + f"Since {estimator_name} does not appear to accept" + " sample_weight, sample weights will only be used for the" + " calibration itself. This can be caused by a limitation of" + " the current scikit-learn API. See the following issue for" + " more details:" + " https://github.com/scikit-learn/scikit-learn/issues/21134." + " Be warned that the result of the calibration is likely to be" + " incorrect." + ) + routed_params = Bunch() + routed_params.splitter = Bunch(split={}) # no routing for splitter + routed_params.estimator = Bunch(fit=fit_params) + if sample_weight is not None and supports_sw: + routed_params.estimator.fit["sample_weight"] = sample_weight + + # Check that each cross-validation fold can have at least one + # example per class + if isinstance(self.cv, int): + n_folds = self.cv + elif hasattr(self.cv, "n_splits"): + n_folds = self.cv.n_splits + else: + n_folds = None + if n_folds and np.any( + [np.sum(y == class_) < n_folds for class_ in self.classes_] + ): + raise ValueError( + f"Requesting {n_folds}-fold " + "cross-validation but provided less than " + f"{n_folds} examples for at least one class." 
+ ) + cv = check_cv(self.cv, y, classifier=True) + + if self.ensemble: + parallel = Parallel(n_jobs=self.n_jobs) + self.calibrated_classifiers_ = parallel( + delayed(_fit_classifier_calibrator_pair)( + clone(estimator), + X, + y, + train=train, + test=test, + method=self.method, + classes=self.classes_, + sample_weight=sample_weight, + fit_params=routed_params.estimator.fit, + ) + for train, test in cv.split(X, y, **routed_params.splitter.split) + ) + else: + this_estimator = clone(estimator) + method_name = _check_response_method( + this_estimator, + ["decision_function", "predict_proba"], + ).__name__ + predictions = cross_val_predict( + estimator=this_estimator, + X=X, + y=y, + cv=cv, + method=method_name, + n_jobs=self.n_jobs, + params=routed_params.estimator.fit, + ) + if len(self.classes_) == 2: + # Ensure shape (n_samples, 1) in the binary case + if method_name == "predict_proba": + # Select the probability column of the positive class + predictions = _process_predict_proba( + y_pred=predictions, + target_type="binary", + classes=self.classes_, + pos_label=self.classes_[1], + ) + predictions = predictions.reshape(-1, 1) + + this_estimator.fit(X, y, **routed_params.estimator.fit) + # Note: Here we don't pass on fit_params because the supported + # calibrators don't support fit_params anyway + calibrated_classifier = _fit_calibrator( + this_estimator, + predictions, + y, + self.classes_, + self.method, + sample_weight, + ) + self.calibrated_classifiers_.append(calibrated_classifier) + + first_clf = self.calibrated_classifiers_[0].estimator + if hasattr(first_clf, "n_features_in_"): + self.n_features_in_ = first_clf.n_features_in_ + if hasattr(first_clf, "feature_names_in_"): + self.feature_names_in_ = first_clf.feature_names_in_ + return self + + def predict_proba(self, X): + """Calibrated probabilities of classification. + + This function returns calibrated probabilities of classification + according to each class on an array of test vectors X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict_proba`. + + Returns + ------- + C : ndarray of shape (n_samples, n_classes) + The predicted probabilities. + """ + check_is_fitted(self) + # Compute the arithmetic mean of the predictions of the calibrated + # classifiers + mean_proba = np.zeros((_num_samples(X), len(self.classes_))) + for calibrated_classifier in self.calibrated_classifiers_: + proba = calibrated_classifier.predict_proba(X) + mean_proba += proba + + mean_proba /= len(self.calibrated_classifiers_) + + return mean_proba + + def predict(self, X): + """Predict the target of new samples. + + The predicted class is the class that has the highest probability, + and can thus be different from the prediction of the uncalibrated classifier. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + C : ndarray of shape (n_samples,) + The predicted class. + """ + check_is_fitted(self) + return self.classes_[np.argmax(self.predict_proba(X), axis=1)] + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information.
+ """ + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + estimator=self._get_estimator(), + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(callee="split", caller="fit"), + ) + ) + return router + + def _more_tags(self): + return { + "_xfail_checks": { + "check_sample_weights_invariance": ( + "Due to the cross-validation and sample ordering, removing a sample" + " is not strictly equal to putting is weight to zero. Specific unit" + " tests are added for CalibratedClassifierCV specifically." + ), + } + } + + +def _fit_classifier_calibrator_pair( + estimator, + X, + y, + train, + test, + method, + classes, + sample_weight=None, + fit_params=None, +): + """Fit a classifier/calibration pair on a given train/test split. + + Fit the classifier on the train set, compute its predictions on the test + set and use the predictions as input to fit the calibrator along with the + test labels. + + Parameters + ---------- + estimator : estimator instance + Cloned base estimator. + + X : array-like, shape (n_samples, n_features) + Sample data. + + y : array-like, shape (n_samples,) + Targets. + + train : ndarray, shape (n_train_indices,) + Indices of the training subset. + + test : ndarray, shape (n_test_indices,) + Indices of the testing subset. + + method : {'sigmoid', 'isotonic'} + Method to use for calibration. + + classes : ndarray, shape (n_classes,) + The target classes. + + sample_weight : array-like, default=None + Sample weights for `X`. + + fit_params : dict, default=None + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + calibrated_classifier : _CalibratedClassifier instance + """ + fit_params_train = _check_method_params(X, params=fit_params, indices=train) + X_train, y_train = _safe_indexing(X, train), _safe_indexing(y, train) + X_test, y_test = _safe_indexing(X, test), _safe_indexing(y, test) + + estimator.fit(X_train, y_train, **fit_params_train) + + predictions, _ = _get_response_values( + estimator, + X_test, + response_method=["decision_function", "predict_proba"], + ) + if predictions.ndim == 1: + # Reshape binary output from `(n_samples,)` to `(n_samples, 1)` + predictions = predictions.reshape(-1, 1) + + sw_test = None if sample_weight is None else _safe_indexing(sample_weight, test) + calibrated_classifier = _fit_calibrator( + estimator, predictions, y_test, classes, method, sample_weight=sw_test + ) + return calibrated_classifier + + +def _fit_calibrator(clf, predictions, y, classes, method, sample_weight=None): + """Fit calibrator(s) and return a `_CalibratedClassifier` + instance. + + `n_classes` (i.e. `len(clf.classes_)`) calibrators are fitted. + However, if `n_classes` equals 2, one calibrator is fitted. + + Parameters + ---------- + clf : estimator instance + Fitted classifier. + + predictions : array-like, shape (n_samples, n_classes) or (n_samples, 1) \ + when binary. + Raw predictions returned by the un-calibrated base classifier. + + y : array-like, shape (n_samples,) + The targets. + + classes : ndarray, shape (n_classes,) + All the prediction classes. + + method : {'sigmoid', 'isotonic'} + The method to use for calibration. + + sample_weight : ndarray, shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. 
+ + Returns + ------- + pipeline : _CalibratedClassifier instance + """ + Y = label_binarize(y, classes=classes) + label_encoder = LabelEncoder().fit(classes) + pos_class_indices = label_encoder.transform(clf.classes_) + calibrators = [] + + if (method == 'isotonic') or (method == 'sigmoid'): + for class_idx, this_pred in zip(pos_class_indices, predictions.T): + if method == "isotonic": + calibrator = IsotonicRegression(out_of_bounds="clip") + else: # "sigmoid" + calibrator = _SigmoidCalibration() + calibrator.fit(this_pred, Y[:, class_idx], sample_weight) + calibrators.append(calibrator) + + elif method == 'Temperature_scaling': + calibrator = _TemperatureScaling() + calibrator.fit(predictions, Y, sample_weight) + + pipeline = _CalibratedClassifier(clf, calibrators, method=method, classes=classes) + return pipeline + + +class _CalibratedClassifier: + """Pipeline-like chaining a fitted classifier and its fitted calibrators. + + Parameters + ---------- + estimator : estimator instance + Fitted classifier. + + calibrators : list of fitted estimator instances + List of fitted calibrators (either 'IsotonicRegression' or + '_SigmoidCalibration'). The number of calibrators equals the number of + classes. However, if there are 2 classes, the list contains only one + fitted calibrator. + + classes : array-like of shape (n_classes,) + All the prediction classes. + + method : {'sigmoid', 'isotonic'}, default='sigmoid' + The method to use for calibration. Can be 'sigmoid' which + corresponds to Platt's method or 'isotonic' which is a + non-parametric approach based on isotonic regression. + """ + + def __init__(self, estimator, calibrators, *, classes, method="sigmoid"): + self.estimator = estimator + self.calibrators = calibrators + self.classes = classes + self.method = method + + def predict_proba(self, X): + """Calculate calibrated probabilities. + + Calculates classification calibrated probabilities + for each class, in a one-vs-all manner, for `X`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The sample data. + + Returns + ------- + proba : array, shape (n_samples, n_classes) + The predicted probabilities. Can be exact zeros. + """ + predictions, _ = _get_response_values( + self.estimator, + X, + response_method=["decision_function", "predict_proba"], + ) + if predictions.ndim == 1: + # Reshape binary output from `(n_samples,)` to `(n_samples, 1)` + predictions = predictions.reshape(-1, 1) + + n_classes = len(self.classes) + + label_encoder = LabelEncoder().fit(self.classes) + pos_class_indices = label_encoder.transform(self.estimator.classes_) + + proba = np.zeros((_num_samples(X), n_classes)) + + # Sigmoid and Isotonic methods + if (self.method == 'sigmoid') or (self.method == 'isotonic'): + + for class_idx, this_pred, calibrator in zip( + pos_class_indices, predictions.T, self.calibrators + ): + if n_classes == 2: + # When binary, `predictions` consists only of predictions for + # clf.classes_[1] but `pos_class_indices` = 0 + class_idx += 1 + proba[:, class_idx] = calibrator.predict(this_pred) + + # Normalize the probabilities + if n_classes == 2: + proba[:, 0] = 1.0 - proba[:, 1] + else: + denominator = np.sum(proba, axis=1)[:, np.newaxis] + # In the edge case where for each class calibrator returns a null + # probability for a given sample, use the uniform distribution + # instead. 
+ uniform_proba = np.full_like(proba, 1 / n_classes) + proba = np.divide( + proba, denominator, out=uniform_proba, where=denominator != 0 + ) + + # Temperature Scaling method + elif self.method == 'temperature_scaling': + + assert len(self.calibrators) == 1, 'Temperature scaling should consist of one calibrator.' + + proba = self.calibrators[0].predict(predictions) + + # Deal with cases where the predicted probability minimally exceeds 1.0 + proba[(1.0 < proba) & (proba <= 1.0 + 1e-5)] = 1.0 + + return proba + + +# The max_abs_prediction_threshold was approximated using +# logit(np.finfo(np.float64).eps) which is about -36 +def _sigmoid_calibration( + predictions, y, sample_weight=None, max_abs_prediction_threshold=30 +): + """Probability Calibration with sigmoid method (Platt 2000) + + Parameters + ---------- + predictions : ndarray of shape (n_samples,) + The decision function or predict proba for the samples. + + y : ndarray of shape (n_samples,) + The targets. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + + Returns + ------- + a : float + The slope. + + b : float + The intercept. + + References + ---------- + Platt, "Probabilistic Outputs for Support Vector Machines" + """ + predictions = column_or_1d(predictions) + y = column_or_1d(y) + + F = predictions # F follows Platt's notations + + scale_constant = 1.0 + max_prediction = np.max(np.abs(F)) + + # If the predictions have large values we scale them in order to bring + # them within a suitable range. This has no effect on the final + # (prediction) result because linear models like Logistic Regression + # without a penalty are invariant to multiplying the features by a + # constant. + if max_prediction >= max_abs_prediction_threshold: + scale_constant = max_prediction + # We rescale the features in a copy: inplace rescaling could confuse + # the caller and make the code harder to reason about. + F = F / scale_constant + + # Bayesian priors (see Platt end of section 2.2): + # It corresponds to the number of samples, taking into account the + # `sample_weight`. + mask_negative_samples = y <= 0 + if sample_weight is not None: + prior0 = (sample_weight[mask_negative_samples]).sum() + prior1 = (sample_weight[~mask_negative_samples]).sum() + else: + prior0 = float(np.sum(mask_negative_samples)) + prior1 = y.shape[0] - prior0 + T = np.zeros_like(y, dtype=predictions.dtype) + T[y > 0] = (prior1 + 1.0) / (prior1 + 2.0) + T[y <= 0] = 1.0 / (prior0 + 2.0) + + bin_loss = HalfBinomialLoss() + + def loss_grad(AB): + # .astype below is needed to ensure y_true and raw_prediction have the + # same dtype. With result = np.float64(0) * np.array([1, 2], dtype=np.float32) + # - in Numpy 2, result.dtype is float64 + # - in Numpy<2, result.dtype is float32 + raw_prediction = -(AB[0] * F + AB[1]).astype(dtype=predictions.dtype) + l, g = bin_loss.loss_gradient( + y_true=T, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + loss = l.sum() + # TODO: Remove casting to np.float64 when minimum supported SciPy is 1.11.2 + # With SciPy >= 1.11.2, the LBFGS implementation will cast to float64 + # https://github.com/scipy/scipy/pull/18825.
+ # Here we cast to float64 to support SciPy < 1.11.2 + grad = np.asarray([-g @ F, -g.sum()], dtype=np.float64) + return loss, grad + + AB0 = np.array([0.0, log((prior0 + 1.0) / (prior1 + 1.0))]) + + opt_result = minimize( + loss_grad, + AB0, + method="L-BFGS-B", + jac=True, + options={ + "gtol": 1e-6, + "ftol": 64 * np.finfo(float).eps, + }, + ) + AB_ = opt_result.x + + # The tuned multiplicative parameter is converted back to the original + # input feature scale. The offset parameter does not need rescaling since + # we did not rescale the outcome variable. + return AB_[0] / scale_constant, AB_[1] + + +class _SigmoidCalibration(RegressorMixin, BaseEstimator): + """Sigmoid regression model. + + Attributes + ---------- + a_ : float + The slope. + + b_ : float + The intercept. + """ + + def fit(self, X, y, sample_weight=None): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like of shape (n_samples,) + Training data. + + y : array-like of shape (n_samples,) + Training target. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + + Returns + ------- + self : object + Returns an instance of self. + """ + X = column_or_1d(X) + y = column_or_1d(y) + X, y = indexable(X, y) + + self.a_, self.b_ = _sigmoid_calibration(X, y, sample_weight) + return self + + def predict(self, T): + """Predict new data by applying the fitted sigmoid. + + Parameters + ---------- + T : array-like of shape (n_samples,) + Data to predict from. + + Returns + ------- + T_ : ndarray of shape (n_samples,) + The predicted data. + """ + T = column_or_1d(T) + return expit(-(self.a_ * T + self.b_)) + + +def _row_max_normalization(data: np.ndarray) -> np.ndarray: + """Normalise the output by subtracting + the per-row maximum element.
+ """ + row_max: np.ndarray = np.max(data, + axis=1, + keepdims=True + ) + + return data - row_max + + +def _softmax_T(predictions: np.ndarray, + temperature: float, + ) -> np.ndarray: + """Softmax function scaled by the inverse temperature + """ + + softmax_T_output: np.ndarray = predictions + softmax_T_output = _row_max_normalization(softmax_T_output) + softmax_T_output /= temperature + softmax_T_output = softmax(softmax_T_output, + axis=1 + ) + softmax_T_output = softmax_T_output.astype(dtype=predictions.dtype) + + return softmax_T_output + + +def _exp_T(predictions: np.ndarray, + temperature: float + ) -> np.ndarray: + """Scale by inverse temperature, and then apply the nature exponential function + """ + + exp_T_output: np.ndarray = predictions + exp_T_output = _row_max_normalization(exp_T_output) + exp_T_output /= temperature + exp_T_output = np.exp(exp_T_output) + + return exp_T_output + + +def _temperature_scaling(predictions: np.ndarray, + labels: np.ndarray, + initial_temperature: float + ) -> float: + """ Minimize the Negative Log Likelihood Loss with respect to Temperature + """ + + def negative_log_likelihood(temperature: float): + """Negative Log Likelihood Loss and its Derivative + with respect to Temperature + """ + + # Losses + losses: np.ndarray = _softmax_T(predictions, + temperature + ) + + # Select the probability of the correct class + losses = losses[np.arange(losses.shape[0]), + labels + ] + + losses = np.log(losses) + + # Derivates with respect to Temperature + exp_T: np.ndarray = _exp_T(predictions, temperature) + exp_T_sum = exp_T.sum(axis=1) + + term_1: np.ndarray = _row_max_normalization(predictions) + term_1 /= temperature ** 2 + term_1 = - term_1[np.arange(term_1.shape[0]), + labels + ] + term_1 *= exp_T_sum + + term_2: np.ndarray = _row_max_normalization(predictions) + term_2 /= temperature ** 2 + term_2 = _row_max_normalization(term_2) + term_2 *= exp_T + term_2 = term_2.sum(axis=1) + + dL_dts: np.ndarray = (term_1 + term_2) / exp_T_sum + + # print(f"{-losses.sum() = }, {-dL_dts.sum() = }") + + return -losses.sum(), -dL_dts.sum() + + temperature_minimizer: minimize = minimize(negative_log_likelihood, + initial_temperature, + method="L-BFGS-B", + bounds=[(1, None)], + jac=True, + options={"gtol": 1e-6, + "ftol": 64 * np.finfo(float).eps, + } + ) + + return temperature_minimizer.x[0] + + +class _TemperatureScaling(): + """Temperature Scaling model. + + Attributes + ---------- + T_ : float + The optimal temperature. + """ + + def __init__(self, + initial_temperature: float = None + ): + + self._initial_temperature: float = initial_temperature + + if initial_temperature is None: + self._initial_temperature = 1.5 + + def fit(self, + X, + y + ): + + self.T_: float = _temperature_scaling(X, y, self._initial_temperature) + + return self + + +@validate_params( + { + "y_true": ["array-like"], + "y_prob": ["array-like"], + "pos_label": [Real, str, "boolean", None], + "n_bins": [Interval(Integral, 1, None, closed="left")], + "strategy": [StrOptions({"uniform", "quantile"})], + }, + prefer_skip_nested_validation=True, +) +def calibration_curve( + y_true, + y_prob, + *, + pos_label=None, + n_bins=5, + strategy="uniform", +): + """Compute true and predicted probabilities for a calibration curve. + + The method assumes the inputs come from a binary classifier, and + discretize the [0, 1] interval into bins. + + Calibration curves may also be referred to as reliability diagrams. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True targets. + + y_prob : array-like of shape (n_samples,) + Probabilities of the positive class. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + + .. versionadded:: 1.1 + + n_bins : int, default=5 + Number of bins to discretize the [0, 1] interval. A bigger number + requires more data. Bins with no samples (i.e. without + corresponding values in `y_prob`) will not be returned, thus the + returned arrays may have less than `n_bins` values. + + strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + + uniform + The bins have identical widths. + quantile + The bins have the same number of samples and depend on `y_prob`. + + Returns + ------- + prob_true : ndarray of shape (n_bins,) or smaller + The proportion of samples whose class is the positive class, in each + bin (fraction of positives). + + prob_pred : ndarray of shape (n_bins,) or smaller + The mean predicted probability in each bin. + + References + ---------- + Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good + Probabilities With Supervised Learning, in Proceedings of the 22nd + International Conference on Machine Learning (ICML). + See section 4 (Qualitative Analysis of Predictions). + + Examples + -------- + >>> import numpy as np + >>> from sklearn.calibration import calibration_curve + >>> y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]) + >>> y_pred = np.array([0.1, 0.2, 0.3, 0.4, 0.65, 0.7, 0.8, 0.9, 1.]) + >>> prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=3) + >>> prob_true + array([0. , 0.5, 1. ]) + >>> prob_pred + array([0.2 , 0.525, 0.85 ]) + """ + y_true = column_or_1d(y_true) + y_prob = column_or_1d(y_prob) + check_consistent_length(y_true, y_prob) + pos_label = _check_pos_label_consistency(pos_label, y_true) + + if y_prob.min() < 0 or y_prob.max() > 1: + raise ValueError("y_prob has values outside [0, 1].") + + labels = np.unique(y_true) + if len(labels) > 2: + raise ValueError( + f"Only binary classification is supported. Provided labels {labels}." + ) + y_true = y_true == pos_label + + if strategy == "quantile": # Determine bin edges by distribution of data + quantiles = np.linspace(0, 1, n_bins + 1) + bins = np.percentile(y_prob, quantiles * 100) + elif strategy == "uniform": + bins = np.linspace(0.0, 1.0, n_bins + 1) + else: + raise ValueError( + "Invalid entry to 'strategy' input. Strategy " + "must be either 'quantile' or 'uniform'." + ) + + binids = np.searchsorted(bins[1:-1], y_prob) + + bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins)) + bin_true = np.bincount(binids, weights=y_true, minlength=len(bins)) + bin_total = np.bincount(binids, minlength=len(bins)) + + nonzero = bin_total != 0 + prob_true = bin_true[nonzero] / bin_total[nonzero] + prob_pred = bin_sums[nonzero] / bin_total[nonzero] + + return prob_true, prob_pred + + +class CalibrationDisplay(_BinaryClassifierCurveDisplayMixin): + """Calibration curve (also known as reliability diagram) visualization. + + It is recommended to use + :func:`~sklearn.calibration.CalibrationDisplay.from_estimator` or + :func:`~sklearn.calibration.CalibrationDisplay.from_predictions` + to create a `CalibrationDisplay`. All parameters are stored as attributes. + + Read more about calibration in the :ref:`User Guide ` and + more about the scikit-learn visualization API in :ref:`visualizations`. + + .. 
versionadded:: 1.0 + + Parameters + ---------- + prob_true : ndarray of shape (n_bins,) + The proportion of samples whose class is the positive class (fraction + of positives), in each bin. + + prob_pred : ndarray of shape (n_bins,) + The mean predicted probability in each bin. + + y_prob : ndarray of shape (n_samples,) + Probability estimates for the positive class, for each sample. + + estimator_name : str, default=None + Name of estimator. If None, the estimator name is not shown. + + pos_label : int, float, bool or str, default=None + The positive class when computing the calibration curve. + By default, `pos_label` is set to `estimators.classes_[1]` when using + `from_estimator` and set to 1 when using `from_predictions`. + + .. versionadded:: 1.1 + + Attributes + ---------- + line_ : matplotlib Artist + Calibration curve. + + ax_ : matplotlib Axes + Axes with calibration curve. + + figure_ : matplotlib Figure + Figure containing the curve. + + See Also + -------- + calibration_curve : Compute true and predicted probabilities for a + calibration curve. + CalibrationDisplay.from_predictions : Plot calibration curve using true + and predicted labels. + CalibrationDisplay.from_estimator : Plot calibration curve using an + estimator and data. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.calibration import calibration_curve, CalibrationDisplay + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = LogisticRegression(random_state=0) + >>> clf.fit(X_train, y_train) + LogisticRegression(random_state=0) + >>> y_prob = clf.predict_proba(X_test)[:, 1] + >>> prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10) + >>> disp = CalibrationDisplay(prob_true, prob_pred, y_prob) + >>> disp.plot() + <...> + """ + + def __init__( + self, prob_true, prob_pred, y_prob, *, estimator_name=None, pos_label=None + ): + self.prob_true = prob_true + self.prob_pred = prob_pred + self.y_prob = y_prob + self.estimator_name = estimator_name + self.pos_label = pos_label + + def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): + """Plot visualization. + + Extra keyword arguments will be passed to + :func:`matplotlib.pyplot.plot`. + + Parameters + ---------- + ax : Matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + name : str, default=None + Name for labeling curve. If `None`, use `estimator_name` if + not `None`, otherwise no labeling is shown. + + ref_line : bool, default=True + If `True`, plots a reference line representing a perfectly + calibrated classifier. + + **kwargs : dict + Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`. + + Returns + ------- + display : :class:`~sklearn.calibration.CalibrationDisplay` + Object that stores computed values. 
+ """ + self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name) + + info_pos_label = ( + f"(Positive class: {self.pos_label})" if self.pos_label is not None else "" + ) + + line_kwargs = {"marker": "s", "linestyle": "-"} + if name is not None: + line_kwargs["label"] = name + line_kwargs.update(**kwargs) + + ref_line_label = "Perfectly calibrated" + existing_ref_line = ref_line_label in self.ax_.get_legend_handles_labels()[1] + if ref_line and not existing_ref_line: + self.ax_.plot([0, 1], [0, 1], "k:", label=ref_line_label) + self.line_ = self.ax_.plot(self.prob_pred, self.prob_true, **line_kwargs)[0] + + # We always have to show the legend for at least the reference line + self.ax_.legend(loc="lower right") + + xlabel = f"Mean predicted probability {info_pos_label}" + ylabel = f"Fraction of positives {info_pos_label}" + self.ax_.set(xlabel=xlabel, ylabel=ylabel) + + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + n_bins=5, + strategy="uniform", + pos_label=None, + name=None, + ref_line=True, + ax=None, + **kwargs, + ): + """Plot calibration curve using a binary classifier and data. + + A calibration curve, also known as a reliability diagram, uses inputs + from a binary classifier and plots the average predicted probability + for each bin against the fraction of positive classes, on the + y-axis. + + Extra keyword arguments will be passed to + :func:`matplotlib.pyplot.plot`. + + Read more about calibration in the :ref:`User Guide ` and + more about the scikit-learn visualization API in :ref:`visualizations`. + + .. versionadded:: 1.0 + + Parameters + ---------- + estimator : estimator instance + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. The classifier must + have a :term:`predict_proba` method. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Binary target values. + + n_bins : int, default=5 + Number of bins to discretize the [0, 1] interval into when + calculating the calibration curve. A bigger number requires more + data. + + strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + + - `'uniform'`: The bins have identical widths. + - `'quantile'`: The bins have the same number of samples and depend + on predicted probabilities. + + pos_label : int, float, bool or str, default=None + The positive class when computing the calibration curve. + By default, `estimators.classes_[1]` is considered as the + positive class. + + .. versionadded:: 1.1 + + name : str, default=None + Name for labeling curve. If `None`, the name of the estimator is + used. + + ref_line : bool, default=True + If `True`, plots a reference line representing a perfectly + calibrated classifier. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + **kwargs : dict + Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`. + + Returns + ------- + display : :class:`~sklearn.calibration.CalibrationDisplay`. + Object that stores computed values. + + See Also + -------- + CalibrationDisplay.from_predictions : Plot calibration curve using true + and predicted labels. 
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.calibration import CalibrationDisplay + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = LogisticRegression(random_state=0) + >>> clf.fit(X_train, y_train) + LogisticRegression(random_state=0) + >>> disp = CalibrationDisplay.from_estimator(clf, X_test, y_test) + >>> plt.show() + """ + y_prob, pos_label, name = cls._validate_and_get_response_values( + estimator, + X, + y, + response_method="predict_proba", + pos_label=pos_label, + name=name, + ) + + return cls.from_predictions( + y, + y_prob, + n_bins=n_bins, + strategy=strategy, + pos_label=pos_label, + name=name, + ref_line=ref_line, + ax=ax, + **kwargs, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_prob, + *, + n_bins=5, + strategy="uniform", + pos_label=None, + name=None, + ref_line=True, + ax=None, + **kwargs, + ): + """Plot calibration curve using true labels and predicted probabilities. + + Calibration curve, also known as reliability diagram, uses inputs + from a binary classifier and plots the average predicted probability + for each bin against the fraction of positive classes, on the + y-axis. + + Extra keyword arguments will be passed to + :func:`matplotlib.pyplot.plot`. + + Read more about calibration in the :ref:`User Guide ` and + more about the scikit-learn visualization API in :ref:`visualizations`. + + .. versionadded:: 1.0 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_prob : array-like of shape (n_samples,) + The predicted probabilities of the positive class. + + n_bins : int, default=5 + Number of bins to discretize the [0, 1] interval into when + calculating the calibration curve. A bigger number requires more + data. + + strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + + - `'uniform'`: The bins have identical widths. + - `'quantile'`: The bins have the same number of samples and depend + on predicted probabilities. + + pos_label : int, float, bool or str, default=None + The positive class when computing the calibration curve. + By default `pos_label` is set to 1. + + .. versionadded:: 1.1 + + name : str, default=None + Name for labeling curve. + + ref_line : bool, default=True + If `True`, plots a reference line representing a perfectly + calibrated classifier. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + **kwargs : dict + Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`. + + Returns + ------- + display : :class:`~sklearn.calibration.CalibrationDisplay`. + Object that stores computed values. + + See Also + -------- + CalibrationDisplay.from_estimator : Plot calibration curve using an + estimator and data. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.calibration import CalibrationDisplay + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... 
X, y, random_state=0) + >>> clf = LogisticRegression(random_state=0) + >>> clf.fit(X_train, y_train) + LogisticRegression(random_state=0) + >>> y_prob = clf.predict_proba(X_test)[:, 1] + >>> disp = CalibrationDisplay.from_predictions(y_test, y_prob) + >>> plt.show() + """ + pos_label_validated, name = cls._validate_from_predictions_params( + y_true, y_prob, sample_weight=None, pos_label=pos_label, name=name + ) + + prob_true, prob_pred = calibration_curve( + y_true, y_prob, n_bins=n_bins, strategy=strategy, pos_label=pos_label + ) + + disp = cls( + prob_true=prob_true, + prob_pred=prob_pred, + y_prob=y_prob, + estimator_name=name, + pos_label=pos_label_validated, + ) + return disp.plot(ax=ax, ref_line=ref_line, **kwargs) From 7e2a444cacecd0178b318ba98b51260003406be1 Mon Sep 17 00:00:00 2001 From: virchan Date: Thu, 9 May 2024 18:44:05 -0700 Subject: [PATCH 11/17] Added the `_TemperatureScaling` class and associated helper functions. --- sklearn/calibration_temperature.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/sklearn/calibration_temperature.py b/sklearn/calibration_temperature.py index 12f287482488d..fa1a4f1f0eb1c 100644 --- a/sklearn/calibration_temperature.py +++ b/sklearn/calibration_temperature.py @@ -254,7 +254,7 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) HasMethods(["fit", "decision_function"]), None, ], - "method": [StrOptions({"isotonic", "sigmoid"})], + "method": [StrOptions({"isotonic", "sigmoid", "temperature"})], "cv": ["cv_object", StrOptions({"prefit"})], "n_jobs": [Integral, None], "ensemble": ["boolean"], @@ -665,9 +665,10 @@ def _fit_calibrator(clf, predictions, y, classes, method, sample_weight=None): calibrator.fit(this_pred, Y[:, class_idx], sample_weight) calibrators.append(calibrator) - elif method == 'Temperature_scaling': + elif method == 'temperature': calibrator = _TemperatureScaling() - calibrator.fit(predictions, Y, sample_weight) + calibrator.fit(predictions, Y) + calibrators.append(calibrator) pipeline = _CalibratedClassifier(clf, calibrators, method=method, classes=classes) return pipeline @@ -760,7 +761,7 @@ def predict_proba(self, X): ) # Temperature Scaling method - elif self.method == 'temperature_scaling': + elif self.method == 'temperature': assert len(self.calibrators) == 1, 'Temperature scaling should consist of one calibrator.' proba = self.calibrators[0].predict(predictions) @@ -1057,10 +1058,26 @@ def fit(self, y ): - self.T_: float = _temperature_scaling(X, y, self._initial_temperature) + self.T_: float = _temperature_scaling(np.log(X), y, self._initial_temperature) return self + def predict(self, X): + """Predict new data by temperature-scaled softmax. + + Parameters + ---------- + X : array-like of shape (n_samples, n_classes) + Data to predict from. + + Returns + ------- + X_ : ndarray of shape (n_samples,) + The predicted data. + """ + + return _softmax_T(np.log(X), self.T_) + @validate_params( { From 7acc77906cd0b6c8d134589a10195d4f2523ea77 Mon Sep 17 00:00:00 2001 From: virchan Date: Tue, 18 Jun 2024 10:03:31 -0700 Subject: [PATCH 12/17] - Converted variables into lowercase to reduce warning messages. - Modified the `negative_log_likelihood` function to allow labels to be one-hot.
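Because `negative_log_likelihood` returns the analytic derivative with respect to the temperature alongside the loss, it can be sanity-checked against a finite difference. For p = softmax(z / T) and integer targets y, the derivative is dNLL/dT = (1 / T^2) * sum_i (z[i, y_i] - sum_j p[i, j] * z[i, j]). A small self-contained check, using an illustrative helper rather than the code in this patch:

    import numpy as np

    def nll_and_grad(z, y, t):
        # Loss and analytic d(loss)/dt for p = softmax(z / t).
        s = z / t
        s -= s.max(axis=1, keepdims=True)  # row-max shift; cancels in the gradient
        p = np.exp(s) / np.exp(s).sum(axis=1, keepdims=True)
        rows = np.arange(len(y))
        loss = -np.log(p[rows, y]).sum()
        grad = ((z[rows, y] - (p * z).sum(axis=1)) / t**2).sum()
        return loss, grad

    rng = np.random.default_rng(0)
    z = 4.0 * rng.normal(size=(64, 3))  # deliberately overconfident logits
    y = rng.integers(0, 3, size=64)
    t, eps = 1.5, 1e-6
    fd = (nll_and_grad(z, y, t + eps)[0] - nll_and_grad(z, y, t - eps)[0]) / (2 * eps)
    assert np.isclose(nll_and_grad(z, y, t)[1], fd, rtol=1e-4)

One-hot labels reduce to the integer-target case through np.argmax(labels, axis=1), which is the same conversion this patch applies inside `negative_log_likelihood`.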
--- sklearn/calibration_temperature.py | 61 +++++++++++++----------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/sklearn/calibration_temperature.py b/sklearn/calibration_temperature.py index fa1a4f1f0eb1c..48b9f2b7a1dae 100644 --- a/sklearn/calibration_temperature.py +++ b/sklearn/calibration_temperature.py @@ -63,7 +63,7 @@ ) -class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): +class CalibratedClassifierCV_test(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Probability calibration with isotonic regression or logistic regression. This class uses cross-validation to both estimate the parameters of a @@ -944,35 +944,33 @@ def _row_max_normalization(data: np.ndarray) -> np.ndarray: return data - row_max -def _softmax_T(predictions: np.ndarray, +def _softmax_t(predictions: np.ndarray, temperature: float, ) -> np.ndarray: """Softmax function scaled by the inverse temperature """ - softmax_T_output: np.ndarray = predictions - softmax_T_output = _row_max_normalization(softmax_T_output) - softmax_T_output /= temperature - softmax_T_output = softmax(softmax_T_output, - axis=1 - ) - softmax_T_output = softmax_T_output.astype(dtype=predictions.dtype) + softmax_t_output: np.ndarray = predictions + softmax_t_output = _row_max_normalization(softmax_t_output) + softmax_t_output /= temperature + softmax_t_output = softmax(softmax_t_output, axis=1) + softmax_t_output = softmax_t_output.astype(dtype=predictions.dtype) - return softmax_T_output + return softmax_t_output -def _exp_T(predictions: np.ndarray, +def _exp_t(predictions: np.ndarray, temperature: float ) -> np.ndarray: """Scale by inverse temperature, and then apply the nature exponential function """ - exp_T_output: np.ndarray = predictions - exp_T_output = _row_max_normalization(exp_T_output) - exp_T_output /= temperature - exp_T_output = np.exp(exp_T_output) + exp_t_output: np.ndarray = predictions + exp_t_output = _row_max_normalization(exp_t_output) + exp_t_output /= temperature + exp_t_output = np.exp(exp_t_output) - return exp_T_output + return exp_t_output def _temperature_scaling(predictions: np.ndarray, @@ -987,43 +985,38 @@ def negative_log_likelihood(temperature: float): with respect to Temperature """ - # Losses - losses: np.ndarray = _softmax_T(predictions, - temperature - ) + # Initiate the Losses + losses: np.ndarray = _softmax_t(predictions, temperature) + class_indices: np.ndarray = np.argmax(labels, axis=1) # Select the probability of the correct class - losses = losses[np.arange(losses.shape[0]), - labels - ] + losses = losses[np.arange(losses.shape[0]), class_indices] losses = np.log(losses) - # Derivates with respect to Temperature - exp_T: np.ndarray = _exp_T(predictions, temperature) - exp_T_sum = exp_T.sum(axis=1) + # Derivatives with respect to Temperature + exp_t: np.ndarray = _exp_t(predictions, temperature) + exp_t_sum = exp_t.sum(axis=1) term_1: np.ndarray = _row_max_normalization(predictions) term_1 /= temperature ** 2 - term_1 = - term_1[np.arange(term_1.shape[0]), - labels - ] - term_1 *= exp_T_sum + term_1 = - term_1[np.arange(term_1.shape[0]), class_indices] + term_1 *= exp_t_sum term_2: np.ndarray = _row_max_normalization(predictions) term_2 /= temperature ** 2 term_2 = _row_max_normalization(term_2) - term_2 *= exp_T + term_2 *= exp_t term_2 = term_2.sum(axis=1) - dL_dts: np.ndarray = (term_1 + term_2) / exp_T_sum + dL_dts: np.ndarray = (term_1 + term_2) / exp_t_sum # print(f"{-losses.sum() = }, {-dL_dts.sum() = }") return -losses.sum(), 
-dL_dts.sum() temperature_minimizer: minimize = minimize(negative_log_likelihood, - initial_temperature, + np.array([initial_temperature]), method="L-BFGS-B", bounds=[(1, None)], jac=True, @@ -1076,7 +1069,7 @@ def predict(self, X): The predicted data. """ - return _softmax_T(np.log(X), self.T_) + return _softmax_t(np.log(X), self.T_) @validate_params( From 25a1bf2324ff6b2326368fe5d7920bf4bef19ff4 Mon Sep 17 00:00:00 2001 From: virchan Date: Tue, 18 Jun 2024 11:19:50 -0700 Subject: [PATCH 13/17] - Converted variables into lowercase to reduce warning messages. - Modified the `negative_log_likelihood` function to allow labels to be one-hot. - Added the `_temperature_scaling_test.py` file. --- sklearn/_temperature_scaling_test.py | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 sklearn/_temperature_scaling_test.py diff --git a/sklearn/_temperature_scaling_test.py b/sklearn/_temperature_scaling_test.py new file mode 100644 index 0000000000000..991ade2b066db --- /dev/null +++ b/sklearn/_temperature_scaling_test.py @@ -0,0 +1,40 @@ +''' +This file is created to test if the custom 'TemperatureScaling' class runs properly, +and serves as proof of work for the changes made to the scikit-learn repository. +Reference: https://github.com/scikit-learn/scikit-learn/issues/28574 + +The file also includes examples related to developing a temperature scaling method +for probability calibration in multi-class classification. +''' + +from sklearn.calibration_temperature import CalibratedClassifierCV_test +from sklearn import datasets +from sklearn.model_selection import train_test_split +from sklearn.svm import SVC +from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier + +# Load the Iris dataset +X, y = datasets.load_iris(return_X_y=True) +X_train, X_calib, y_train, y_calib = train_test_split(X, y) + +# Load the following classifiers for testing +SV_classifier: SVC = SVC(probability=True) +Logistic_classifier: LogisticRegression = LogisticRegression() +Tree_classifier: DecisionTreeClassifier = DecisionTreeClassifier() + +# Initiate the calibrators for the classifiers +SVC_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(SV_classifier, cv=3, method='temperature') +Logistic_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(Logistic_classifier, cv=3, method='temperature') +Tree_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(Tree_classifier, cv=3, method='temperature') + +# Fit all classifier-calibrator pairs +SVC_scaled.fit(X_train,y_train) +Logistic_scaled.fit(X_train,y_train) +Tree_scaled.fit(X_train,y_train) + +print("Optimal Temperatures For Each Classifiers") +print(f"- SVC: {SVC_scaled.calibrated_classifiers_[0].calibrators[0].T_}") +print(f"- Logistic: {Logistic_scaled.calibrated_classifiers_[0].calibrators[0].T_}") +print(f"- Decision Tree: {Tree_scaled.calibrated_classifiers_[0].calibrators[0].T_}") + From 26f458d3f8d181b7fd9814f99dd61834ffbbfd9f Mon Sep 17 00:00:00 2001 From: virchan Date: Wed, 19 Jun 2024 13:03:44 -0700 Subject: [PATCH 14/17] Added doc-strings to temperature-scaling-related functions. 
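To make the behaviour these doc-strings describe concrete: temperature scaling only rescales
confidence, replacing softmax(z) by softmax(z / T) for a single scalar T fitted by maximum
likelihood, so the argmax prediction is unchanged. A minimal stand-alone sketch follows;
it is illustrative only, and `fit_temperature` is a hypothetical name, not part of the patch.

    import numpy as np
    from scipy.optimize import minimize
    from scipy.special import softmax

    def fit_temperature(logits, y, t0=1.5):
        # Find T >= 1 minimising the negative log likelihood of softmax(logits / T).
        def nll(t):
            probs = softmax(logits / t[0], axis=1)
            return -np.log(probs[np.arange(len(y)), y]).sum()
        return minimize(nll, np.array([t0]), method="L-BFGS-B", bounds=[(1.0, None)]).x[0]

    rng = np.random.default_rng(0)
    logits = 5.0 * rng.normal(size=(200, 3))   # deliberately over-confident scores
    y = rng.integers(0, 3, size=200)
    T = fit_temperature(logits, y)
    print(T)                                   # T > 1 softens the probabilities
    # Calibration never changes the predicted class, only the confidence:
    assert (softmax(logits / T, axis=1).argmax(1) == softmax(logits, axis=1).argmax(1)).all()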
---
 sklearn/_temperature_scaling_test.py |   2 +
 sklearn/calibration_temperature.py   | 145 +++++++++++++++++++++++----
 2 files changed, 130 insertions(+), 17 deletions(-)

diff --git a/sklearn/_temperature_scaling_test.py b/sklearn/_temperature_scaling_test.py
index 991ade2b066db..66c3ffdc56f91 100644
--- a/sklearn/_temperature_scaling_test.py
+++ b/sklearn/_temperature_scaling_test.py
@@ -33,6 +33,8 @@
 Logistic_scaled.fit(X_train,y_train)
 Tree_scaled.fit(X_train,y_train)

+print(f" Initial temperatureSVC: {SVC_scaled.calibrated_classifiers_[0].calibrators[0]._initial_temperature}")
+
 print("Optimal Temperatures For Each Classifiers")
 print(f"- SVC: {SVC_scaled.calibrated_classifiers_[0].calibrators[0].T_}")
 print(f"- Logistic: {Logistic_scaled.calibrated_classifiers_[0].calibrators[0].T_}")
diff --git a/sklearn/calibration_temperature.py b/sklearn/calibration_temperature.py
index 48b9f2b7a1dae..20b211a411b18 100644
--- a/sklearn/calibration_temperature.py
+++ b/sklearn/calibration_temperature.py
@@ -64,7 +64,7 @@

 class CalibratedClassifierCV_test(ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
-    """Probability calibration with isotonic regression or logistic regression.
+    """Probability calibration with isotonic regression, logistic regression, or temperature scaling (in-progress).

     This class uses cross-validation to both estimate the parameters of a
     classifier and subsequently calibrate a classifier. With default
@@ -98,11 +98,12 @@ class CalibratedClassifierCV_test(ClassifierMixin, MetaEstimatorMixin, BaseEstim

         .. versionadded:: 1.2

-    method : {'sigmoid', 'isotonic'}, default='sigmoid'
+    method : {'sigmoid', 'isotonic', 'temperature'}, default='sigmoid'
         The method to use for calibration. Can be 'sigmoid' which
-        corresponds to Platt's method (i.e. a logistic regression model) or
-        'isotonic' which is a non-parametric approach. It is not advised to
-        use isotonic calibration with too few calibration samples
+        corresponds to Platt's method (i.e. a logistic regression model),
+        'isotonic' which is a non-parametric approach, or 'temperature'
+        which corresponds to the temperature scaling method. It is not
+        advised to use isotonic calibration with too few calibration samples
         ``(<<1000)`` since it tends to overfit.

     cv : int, cross-validation generator, iterable or "prefit", \
             default=None
@@ -211,6 +212,9 @@
     .. [4] Predicting Good Probabilities with Supervised Learning,
            A. Niculescu-Mizil & R. Caruana, ICML 2005

+    .. [5] On Calibration of Modern Neural Networks,
+           C. Guo, G. Pleiss, Y. Sun & K. Q. Weinberger, ICML 2017
+
     Examples
     --------
     >>> from sklearn.datasets import make_classification
@@ -933,9 +937,20 @@ def predict(self, T):

 def _row_max_normalization(data: np.ndarray) -> np.ndarray:
-    """Normalise the output by subtracting
-    the per-row maximum element.
+    """Normalize the input data by subtracting the maximum value of each row.
+
+    Parameters
+    ----------
+    data : np.ndarray
+        The input data array of shape (n_samples, n_classes).
+
+    Returns
+    -------
+    np.ndarray
+        A 2D array of the same shape as `data` where each row has been normalized
+        by subtracting the maximum value of that row.
""" + row_max: np.ndarray = np.max(data, axis=1, keepdims=True @@ -947,7 +962,27 @@ def _row_max_normalization(data: np.ndarray) -> np.ndarray: def _softmax_t(predictions: np.ndarray, temperature: float, ) -> np.ndarray: - """Softmax function scaled by the inverse temperature + """Compute the temperature-scaled softmax of the input predictions. + + Parameters + ---------- + predictions : np.ndarray + The input predictions array of shape (n_sample, n_classes). + + temperature : float + The temperature parameter for scaling. + + Returns + ------- + np.ndarray + A 2D array of the same shape as `predictions` containing the temperature-scaled + softmax probabilities. + + Notes + ----- + - This function internally normalizes the predictions by subtracting the row-wise + maximum to improve numerical stability before scaling by the temperature. + - The softmax computation is done along the last axis of the input predictions. """ softmax_t_output: np.ndarray = predictions @@ -962,7 +997,27 @@ def _softmax_t(predictions: np.ndarray, def _exp_t(predictions: np.ndarray, temperature: float ) -> np.ndarray: - """Scale by inverse temperature, and then apply the nature exponential function + """Scale predictions by the inverse temperature and apply the exponential function. + + Parameters + ---------- + predictions : np.ndarray + The input predictions array of shape (n_samples, n_classes). + + temperature : float + The temperature parameter for scaling. + + Returns + ------- + np.ndarray + A 2D array of the same shape as `predictions` containing the scaled and + exponentiated values. + + Notes + ----- + - This function internally normalizes the predictions by subtracting the row-wise + maximum to improve numerical stability before scaling by the temperature and + applying the exponential function. """ exp_t_output: np.ndarray = predictions @@ -977,12 +1032,46 @@ def _temperature_scaling(predictions: np.ndarray, labels: np.ndarray, initial_temperature: float ) -> float: - """ Minimize the Negative Log Likelihood Loss with respect to Temperature + """Probability Calibration with temperature scaling (Guo-Pleiss-Sun-Weinberger 2017). + + Parameters + ---------- + predictions : ndarray of shape (n_samples,) + The decision function or predict proba for the samples. + + labels : ndarray of shape (n_samples, n_classes) + One-hot encoded true labels for the samples. + + initial_temperature : float + Initial temperature value to start the optimisation + + Returns + ------- + float + The optimised temperature parameter for probability calibration, with a + value in the range [1, infinity). + + References + ---------- + Guo, Pleiss, Sun & Weinberger, "On Calibration of Modern Neural Networks" """ def negative_log_likelihood(temperature: float): - """Negative Log Likelihood Loss and its Derivative - with respect to Temperature + """ Compute the negative log likelihood loss and its derivative + with respect to temperature. + + Parameters + ---------- + temperature : float + The current temperature value during optimisation. + + Returns + ------- + float + The negative log likelihood loss. + float + The derivative of the negative log likelihood loss with respect to + temperature. 
""" # Initiate the Losses @@ -1009,11 +1098,11 @@ def negative_log_likelihood(temperature: float): term_2 *= exp_t term_2 = term_2.sum(axis=1) - dL_dts: np.ndarray = (term_1 + term_2) / exp_t_sum + dlosses_dts: np.ndarray = (term_1 + term_2) / exp_t_sum # print(f"{-losses.sum() = }, {-dL_dts.sum() = }") - return -losses.sum(), -dL_dts.sum() + return -losses.sum(), -dlosses_dts.sum() temperature_minimizer: minimize = minimize(negative_log_likelihood, np.array([initial_temperature]), @@ -1033,8 +1122,14 @@ class _TemperatureScaling(): Attributes ---------- + + _initial_temperature: float or None + Initial temperature value to start the optimisation. + If None, the it is set to 1.5. + + T_ : float - The optimal temperature. + The optimised temperature for probability calibration. """ def __init__(self, @@ -1050,6 +1145,21 @@ def fit(self, X, y ): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_classes) + Training data. + + y : array-like of shape (n_samples, n_classes) + Training target. + + Returns + ------- + self : object + Returns an instance of self. + """ self.T_: float = _temperature_scaling(np.log(X), y, self._initial_temperature) @@ -1061,11 +1171,12 @@ def predict(self, X): Parameters ---------- X : array-like of shape (n_samples, n_classes) - Data to predict from. + The decision function or predict proba for the samples + Returns ------- - X_ : ndarray of shape (n_samples,) + ndarray of shape (n_samples, n_classes) The predicted data. """ From 6160ee18418a58ac6070f532a6bf85fbe6b571d0 Mon Sep 17 00:00:00 2001 From: virchan Date: Fri, 12 Jul 2024 14:56:20 -0700 Subject: [PATCH 15/17] Modified the `.fit()` method of temperature scaling. Now it can handle outputs from `decision_function` function. Also added the `_additive_smoothing` function to avoid numerical instability when applying logarithm. --- sklearn/_temperature_scaling_test.py | 27 ++++++-- sklearn/calibration_temperature.py | 96 ++++++++++++++++++++++------ 2 files changed, 99 insertions(+), 24 deletions(-) diff --git a/sklearn/_temperature_scaling_test.py b/sklearn/_temperature_scaling_test.py index 66c3ffdc56f91..92982de2f4344 100644 --- a/sklearn/_temperature_scaling_test.py +++ b/sklearn/_temperature_scaling_test.py @@ -5,6 +5,7 @@ The file also includes examples related to developing a temperature scaling method for probability calibration in multi-class classification. 
+ ''' from sklearn.calibration_temperature import CalibratedClassifierCV_test @@ -22,21 +23,37 @@ SV_classifier: SVC = SVC(probability=True) Logistic_classifier: LogisticRegression = LogisticRegression() Tree_classifier: DecisionTreeClassifier = DecisionTreeClassifier() +# compare_classifier = DecisionTreeClassifier() # Initiate the calibrators for the classifiers -SVC_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(SV_classifier, cv=3, method='temperature') -Logistic_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(Logistic_classifier, cv=3, method='temperature') -Tree_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(Tree_classifier, cv=3, method='temperature') +SVC_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(SV_classifier, + cv=3, + method='temperature' + ) +Logistic_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(Logistic_classifier, + cv=3, + method='temperature' + ) +Tree_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(Tree_classifier, + cv=3, + method='temperature' + ) # Fit all classifier-calibrator pairs SVC_scaled.fit(X_train,y_train) Logistic_scaled.fit(X_train,y_train) Tree_scaled.fit(X_train,y_train) +# compare_classifier.fit(X_train, y_train) -print(f" Initial temperatureSVC: {SVC_scaled.calibrated_classifiers_[0].calibrators[0]._initial_temperature}") - +# print(f" Initial temperatureSVC: {SVC_scaled.calibrated_classifiers_[0].calibrators[0]._initial_temperature}") print("Optimal Temperatures For Each Classifiers") print(f"- SVC: {SVC_scaled.calibrated_classifiers_[0].calibrators[0].T_}") print(f"- Logistic: {Logistic_scaled.calibrated_classifiers_[0].calibrators[0].T_}") print(f"- Decision Tree: {Tree_scaled.calibrated_classifiers_[0].calibrators[0].T_}") +print("Printing calibrated probabilities...") +print(f"{SVC_scaled.predict_proba((X_calib)) = }") +print(f"{Logistic_scaled.predict_proba((X_calib))=}") +print(f"{Tree_scaled.predict_proba(X_calib)=}") +# print(f"{compare_classifier.predict_proba(X_calib)=}") +print(f"{y_calib=}") diff --git a/sklearn/calibration_temperature.py b/sklearn/calibration_temperature.py index 20b211a411b18..04fe0c7a92f67 100644 --- a/sklearn/calibration_temperature.py +++ b/sklearn/calibration_temperature.py @@ -463,6 +463,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): self.n_features_in_ = first_clf.n_features_in_ if hasattr(first_clf, "feature_names_in_"): self.feature_names_in_ = first_clf.feature_names_in_ + return self def predict_proba(self, X): @@ -959,6 +960,38 @@ def _row_max_normalization(data: np.ndarray) -> np.ndarray: return data - row_max +def _additive_smoothing(probabilities: np.ndarray) -> np.ndarray: + """Additive Smoothing. + Modify the original probability array to avoid numerical instability when + applying logarithm. + + This method adjusts probabilities to avoid exact 0 or 1 values by using + a fixed transformation. The transformation ensures that probabilities + are within a safe range for logarithmic operations. + + For more details, refer to: + https://en.wikipedia.org/wiki/Additive_smoothing + + Parameters + ---------- + probabilities : np.ndarray + The input 2D numpy array of probabilities. + + Returns + ------- + np.ndarray + The smoothed probability array, with values adjusted to avoid 0 and 1. 
+ """ + + n_classes: int = probabilities.shape[1] + + smooth_probabilities: np.ndarray = (probabilities * (n_classes - 1) + 0.5) / n_classes + + smooth_probabilities = smooth_probabilities.astype(dtype=probabilities.dtype) + + return smooth_probabilities + + def _softmax_t(predictions: np.ndarray, temperature: float, ) -> np.ndarray: @@ -977,16 +1010,9 @@ def _softmax_t(predictions: np.ndarray, np.ndarray A 2D array of the same shape as `predictions` containing the temperature-scaled softmax probabilities. - - Notes - ----- - - This function internally normalizes the predictions by subtracting the row-wise - maximum to improve numerical stability before scaling by the temperature. - - The softmax computation is done along the last axis of the input predictions. """ softmax_t_output: np.ndarray = predictions - softmax_t_output = _row_max_normalization(softmax_t_output) softmax_t_output /= temperature softmax_t_output = softmax(softmax_t_output, axis=1) softmax_t_output = softmax_t_output.astype(dtype=predictions.dtype) @@ -1012,16 +1038,10 @@ def _exp_t(predictions: np.ndarray, np.ndarray A 2D array of the same shape as `predictions` containing the scaled and exponentiated values. - - Notes - ----- - - This function internally normalizes the predictions by subtracting the row-wise - maximum to improve numerical stability before scaling by the temperature and - applying the exponential function. """ exp_t_output: np.ndarray = predictions - exp_t_output = _row_max_normalization(exp_t_output) + # exp_t_output = _row_max_normalization(exp_t_output) exp_t_output /= temperature exp_t_output = np.exp(exp_t_output) @@ -1087,14 +1107,16 @@ def negative_log_likelihood(temperature: float): exp_t: np.ndarray = _exp_t(predictions, temperature) exp_t_sum = exp_t.sum(axis=1) - term_1: np.ndarray = _row_max_normalization(predictions) + # term_1: np.ndarray = _row_max_normalization(predictions) + term_1: np.ndarray = _additive_smoothing(predictions) term_1 /= temperature ** 2 term_1 = - term_1[np.arange(term_1.shape[0]), class_indices] term_1 *= exp_t_sum - term_2: np.ndarray = _row_max_normalization(predictions) + # term_2: np.ndarray = _row_max_normalization(predictions) + term_2: np.ndarray = _additive_smoothing(predictions) term_2 /= temperature ** 2 - term_2 = _row_max_normalization(term_2) + # term_2 = _row_max_normalization(term_2) term_2 *= exp_t term_2 = term_2.sum(axis=1) @@ -1117,6 +1139,31 @@ def negative_log_likelihood(temperature: float): return temperature_minimizer.x[0] +def _is_predict_proba(X: np.ndarray) -> bool: + """ + Helper function to check if the input array contains probabilities. + + Specifically, it checks if all rows in the array sum to 1 and if all + entries are floats between 0 and 1. + + Parameters: + ---------- + np.ndarray: The input 2D numpy array. + + Returns: + -------- + bool: True if the array is likely to be probabilities, False if it is likely to be logits. + """ + + # Check if all entries are between 0 and 1 + entries_zero_to_one: bool = np.all((X >= 0) & (X <= 1)) + + # Check if each row sums approximately to 1 + row_sums_to_one: bool = np.all(np.isclose(np.sum(X, axis=1), 1.0)) + + return entries_zero_to_one and row_sums_to_one + + class _TemperatureScaling(): """Temperature Scaling model. @@ -1161,7 +1208,14 @@ def fit(self, Returns an instance of self. 
""" - self.T_: float = _temperature_scaling(np.log(X), y, self._initial_temperature) + # If X are outputs of `decision_function` + # i.e., logits (e.g., SVC(probability=False) ) + if _is_predict_proba(X): + self.T_ = _temperature_scaling(np.log(_additive_smoothing(X)), y, self._initial_temperature) + + # If X are outputs of `predict_proba` + else: + self.T_ = _temperature_scaling(X, y, self._initial_temperature) return self @@ -1180,7 +1234,11 @@ def predict(self, X): The predicted data. """ - return _softmax_t(np.log(X), self.T_) + if _is_predict_proba(X): + return _softmax_t(np.log(_additive_smoothing(X)), self.T_) + + else: + return _softmax_t(X, self.T_) @validate_params( From 67afee5d665ceb30938aba5aa41866d840d8a16a Mon Sep 17 00:00:00 2001 From: virchan Date: Wed, 17 Jul 2024 14:59:03 -0700 Subject: [PATCH 16/17] 1. Modified the `_TemperatureScaling` class to adept `sample_weight` argument. 2. Modified the `_temperature_scaling` function. The initial temperature is now 1.0, and the optimised temperature is in interval [1e-2, inf). 3. Revise doc-strings. --- sklearn/_temperature_scaling_test.py | 24 ++--- sklearn/calibration_temperature.py | 127 ++++++++++++++------------- 2 files changed, 81 insertions(+), 70 deletions(-) diff --git a/sklearn/_temperature_scaling_test.py b/sklearn/_temperature_scaling_test.py index 92982de2f4344..51cfbea37e33f 100644 --- a/sklearn/_temperature_scaling_test.py +++ b/sklearn/_temperature_scaling_test.py @@ -1,11 +1,15 @@ ''' This file is created to test if the custom 'TemperatureScaling' class runs properly, and serves as proof of work for the changes made to the scikit-learn repository. -Reference: https://github.com/scikit-learn/scikit-learn/issues/28574 The file also includes examples related to developing a temperature scaling method for probability calibration in multi-class classification. + +References: +----------- + .. [1] https://github.com/scikit-learn/scikit-learn/issues/28574. Original issue + on Github. 
''' from sklearn.calibration_temperature import CalibratedClassifierCV_test @@ -20,10 +24,10 @@ X_train, X_calib, y_train, y_calib = train_test_split(X, y) # Load the following classifiers for testing -SV_classifier: SVC = SVC(probability=True) +SV_classifier: SVC = SVC(probability=False) Logistic_classifier: LogisticRegression = LogisticRegression() Tree_classifier: DecisionTreeClassifier = DecisionTreeClassifier() -# compare_classifier = DecisionTreeClassifier() + # Initiate the calibrators for the classifiers SVC_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(SV_classifier, @@ -31,7 +35,7 @@ method='temperature' ) Logistic_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(Logistic_classifier, - cv=3, + cv=7, method='temperature' ) Tree_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(Tree_classifier, @@ -39,21 +43,21 @@ method='temperature' ) + # Fit all classifier-calibrator pairs SVC_scaled.fit(X_train,y_train) Logistic_scaled.fit(X_train,y_train) Tree_scaled.fit(X_train,y_train) # compare_classifier.fit(X_train, y_train) -# print(f" Initial temperatureSVC: {SVC_scaled.calibrated_classifiers_[0].calibrators[0]._initial_temperature}") print("Optimal Temperatures For Each Classifiers") -print(f"- SVC: {SVC_scaled.calibrated_classifiers_[0].calibrators[0].T_}") -print(f"- Logistic: {Logistic_scaled.calibrated_classifiers_[0].calibrators[0].T_}") -print(f"- Decision Tree: {Tree_scaled.calibrated_classifiers_[0].calibrators[0].T_}") +print(f"{SVC_scaled.calibrated_classifiers_[0].calibrators[0].T_=}") +print(f"{Logistic_scaled.calibrated_classifiers_[0].calibrators[0].T_=}") +print(f"{Tree_scaled.calibrated_classifiers_[0].calibrators[0].T_=}") print("Printing calibrated probabilities...") -print(f"{SVC_scaled.predict_proba((X_calib)) = }") -print(f"{Logistic_scaled.predict_proba((X_calib))=}") +print(f"{SVC_scaled.predict_proba(X_calib) = }") +print(f"{Logistic_scaled.predict_proba(X_calib) = }") print(f"{Tree_scaled.predict_proba(X_calib)=}") # print(f"{compare_classifier.predict_proba(X_calib)=}") print(f"{y_calib=}") diff --git a/sklearn/calibration_temperature.py b/sklearn/calibration_temperature.py index 04fe0c7a92f67..b7c656842f559 100644 --- a/sklearn/calibration_temperature.py +++ b/sklearn/calibration_temperature.py @@ -672,7 +672,7 @@ def _fit_calibrator(clf, predictions, y, classes, method, sample_weight=None): elif method == 'temperature': calibrator = _TemperatureScaling() - calibrator.fit(predictions, Y) + calibrator.fit(predictions, Y, sample_weight) calibrators.append(calibrator) pipeline = _CalibratedClassifier(clf, calibrators, method=method, classes=classes) @@ -943,12 +943,12 @@ def _row_max_normalization(data: np.ndarray) -> np.ndarray: Parameters ---------- data : np.ndarray - The input data array of shape (n_samples, n_classes). + The input array. Returns ------- np.ndarray - A 2D array of the same shape as `data` where each row has been normalized + An array of the same shape as `data` where each row has been normalized by subtracting the maximum value of that row. """ @@ -974,13 +974,13 @@ def _additive_smoothing(probabilities: np.ndarray) -> np.ndarray: Parameters ---------- - probabilities : np.ndarray - The input 2D numpy array of probabilities. + probabilities : np.ndarray + The input 2D numpy array of probabilities. Returns ------- - np.ndarray - The smoothed probability array, with values adjusted to avoid 0 and 1. + np.ndarray + The smoothed probability array, with values adjusted to avoid 0 and 1. 
""" n_classes: int = probabilities.shape[1] @@ -992,15 +992,15 @@ def _additive_smoothing(probabilities: np.ndarray) -> np.ndarray: return smooth_probabilities -def _softmax_t(predictions: np.ndarray, +def _softmax_t(X: np.ndarray, temperature: float, ) -> np.ndarray: - """Compute the temperature-scaled softmax of the input predictions. + """Compute the temperature-scaled softmax of the input array. Parameters ---------- - predictions : np.ndarray - The input predictions array of shape (n_sample, n_classes). + X : np.ndarray + The input array. temperature : float The temperature parameter for scaling. @@ -1008,27 +1008,28 @@ def _softmax_t(predictions: np.ndarray, Returns ------- np.ndarray - A 2D array of the same shape as `predictions` containing the temperature-scaled + An array of the same shape as the input containing the temperature-scaled softmax probabilities. """ - softmax_t_output: np.ndarray = predictions + softmax_t_output: np.ndarray = X + softmax_t_output = _row_max_normalization(softmax_t_output) softmax_t_output /= temperature softmax_t_output = softmax(softmax_t_output, axis=1) - softmax_t_output = softmax_t_output.astype(dtype=predictions.dtype) + softmax_t_output = softmax_t_output.astype(dtype=X.dtype) return softmax_t_output -def _exp_t(predictions: np.ndarray, +def _exp_t(X: np.ndarray, temperature: float ) -> np.ndarray: """Scale predictions by the inverse temperature and apply the exponential function. Parameters ---------- - predictions : np.ndarray - The input predictions array of shape (n_samples, n_classes). + X : np.ndarray + The input array. temperature : float The temperature parameter for scaling. @@ -1036,21 +1037,23 @@ def _exp_t(predictions: np.ndarray, Returns ------- np.ndarray - A 2D array of the same shape as `predictions` containing the scaled and + An array of the same shape as the input containing the temperature-scaled and exponentiated values. """ - exp_t_output: np.ndarray = predictions - # exp_t_output = _row_max_normalization(exp_t_output) + exp_t_output: np.ndarray = X + exp_t_output = _row_max_normalization(exp_t_output) exp_t_output /= temperature exp_t_output = np.exp(exp_t_output) + exp_t_output = exp_t_output.astype(dtype=X.dtype) return exp_t_output def _temperature_scaling(predictions: np.ndarray, labels: np.ndarray, - initial_temperature: float + sample_weight=None, + initial_temperature: float = 1.0 ) -> float: """Probability Calibration with temperature scaling (Guo-Pleiss-Sun-Weinberger 2017). @@ -1062,8 +1065,11 @@ def _temperature_scaling(predictions: np.ndarray, labels : ndarray of shape (n_samples, n_classes) One-hot encoded true labels for the samples. - initial_temperature : float - Initial temperature value to start the optimisation + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + + initial_temperature : float, default=1.0 + Initial temperature value to start the optimisation. Returns ------- @@ -1089,6 +1095,7 @@ def negative_log_likelihood(temperature: float): ------- float The negative log likelihood loss. + float The derivative of the negative log likelihood loss with respect to temperature. 
@@ -1103,33 +1110,38 @@ def negative_log_likelihood(temperature: float): losses = np.log(losses) + # Apply sample weight + if sample_weight is not None: + losses *= sample_weight + # Derivatives with respect to Temperature exp_t: np.ndarray = _exp_t(predictions, temperature) exp_t_sum = exp_t.sum(axis=1) - # term_1: np.ndarray = _row_max_normalization(predictions) - term_1: np.ndarray = _additive_smoothing(predictions) + term_1: np.ndarray = predictions + term_1 = _row_max_normalization(predictions) term_1 /= temperature ** 2 term_1 = - term_1[np.arange(term_1.shape[0]), class_indices] term_1 *= exp_t_sum - # term_2: np.ndarray = _row_max_normalization(predictions) - term_2: np.ndarray = _additive_smoothing(predictions) + term_2: np.ndarray = predictions + term_2 = _row_max_normalization(term_2) term_2 /= temperature ** 2 - # term_2 = _row_max_normalization(term_2) term_2 *= exp_t term_2 = term_2.sum(axis=1) dlosses_dts: np.ndarray = (term_1 + term_2) / exp_t_sum - # print(f"{-losses.sum() = }, {-dL_dts.sum() = }") + # Apply sample weight + if sample_weight is not None: + dlosses_dts *= sample_weight return -losses.sum(), -dlosses_dts.sum() temperature_minimizer: minimize = minimize(negative_log_likelihood, np.array([initial_temperature]), method="L-BFGS-B", - bounds=[(1, None)], + bounds=[(1e-2, None)], jac=True, options={"gtol": 1e-6, "ftol": 64 * np.finfo(float).eps, @@ -1140,19 +1152,20 @@ def negative_log_likelihood(temperature: float): def _is_predict_proba(X: np.ndarray) -> bool: - """ - Helper function to check if the input array contains probabilities. - + """Helper function to check if the input array contains probabilities. Specifically, it checks if all rows in the array sum to 1 and if all entries are floats between 0 and 1. Parameters: ---------- - np.ndarray: The input 2D numpy array. + X : np.ndarray + The input numpy array of shape (n_samples, n_classes). Returns: -------- - bool: True if the array is likely to be probabilities, False if it is likely to be logits. + bool + True if the array is likely to be output of `predict_proba`, + False if it is likely to be output of `decision_function`. """ # Check if all entries are between 0 and 1 @@ -1164,43 +1177,36 @@ def _is_predict_proba(X: np.ndarray) -> bool: return entries_zero_to_one and row_sums_to_one -class _TemperatureScaling(): +class _TemperatureScaling(RegressorMixin, BaseEstimator): """Temperature Scaling model. Attributes ---------- - - _initial_temperature: float or None - Initial temperature value to start the optimisation. - If None, the it is set to 1.5. - - T_ : float The optimised temperature for probability calibration. + Available after the calibrator is fitted. """ - def __init__(self, - initial_temperature: float = None - ): - - self._initial_temperature: float = initial_temperature - - if initial_temperature is None: - self._initial_temperature = 1.5 def fit(self, X, - y + y, + sample_weight=None ): """Fit the model using X, y as training data. Parameters ---------- - X : array-like of shape (n_samples, n_classes) + X : np.ndarray + array-like of shape (n_samples, n_classes). Training data. - y : array-like of shape (n_samples, n_classes) - Training target. + y : np.ndarray + array-like of shape (n_samples, n_classes) + Training labels. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. 
        Returns
         -------
         self : object
             Returns an instance of self.
         """

         # If X are outputs of `predict_proba`, i.e., probabilities
         # (e.g., SVC(probability=True)), map them to log space first.
         if _is_predict_proba(X):
-            self.T_ = _temperature_scaling(np.log(_additive_smoothing(X)), y, self._initial_temperature)
+            self.T_ = _temperature_scaling(np.log(_additive_smoothing(X)), y, sample_weight)

         # If X are outputs of `decision_function`,
         # i.e., logits (e.g., SVC(probability=False)), use them directly.
         else:
-            self.T_ = _temperature_scaling(X, y, self._initial_temperature)
+            self.T_ = _temperature_scaling(X, y, sample_weight)

         return self

From e72adfec27cf806fad2bf44c23c54f7deff5980c Mon Sep 17 00:00:00 2001
From: virchan
Date: Thu, 18 Jul 2024 10:50:41 -0700
Subject: [PATCH 17/17] Modified `_temperature_scaling_test.py` and
 `calibration_temperature.py` for the first PR draft.

---
 sklearn/_temperature_scaling_test.py | 30 ++++++++++++++++----------
 sklearn/calibration_temperature.py   |  6 +++---
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/sklearn/_temperature_scaling_test.py b/sklearn/_temperature_scaling_test.py
index 51cfbea37e33f..e0a95523ed58b 100644
--- a/sklearn/_temperature_scaling_test.py
+++ b/sklearn/_temperature_scaling_test.py
@@ -8,8 +8,11 @@

 References:
 -----------
-    .. [1] https://github.com/scikit-learn/scikit-learn/issues/28574. Original issue
-        on Github.
+    .. [1] https://github.com/scikit-learn/scikit-learn/issues/28574. Original issue
+        on Github.
+
+    .. [2] On Calibration of Modern Neural Networks,
+        C. Guo, G. Pleiss, Y. Sun & K. Q. Weinberger, ICML 2017
 '''

 from sklearn.calibration_temperature import CalibratedClassifierCV_test
@@ -19,17 +22,20 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier

-# Load the Iris dataset
+# We demonstrate with the Iris dataset, because
+# it is small, multi-class, and bundled with scikit-learn.
 X, y = datasets.load_iris(return_X_y=True)
 X_train, X_calib, y_train, y_calib = train_test_split(X, y)

 # Load the following classifiers for testing
+# - Support vector classifier
+# - Logistic regressor
+# - Decision tree classifier
 SV_classifier: SVC = SVC(probability=False)
 Logistic_classifier: LogisticRegression = LogisticRegression()
 Tree_classifier: DecisionTreeClassifier = DecisionTreeClassifier()

-# Initiate the calibrators for the classifiers
+# Initiate the temperature scaling calibrators for the classifiers
 SVC_scaled: CalibratedClassifierCV_test = CalibratedClassifierCV_test(SV_classifier,
                                                                       cv=3,
                                                                       method='temperature'
                                                                       )
@@ -43,21 +49,23 @@
                                                                       method='temperature'
                                                                       )

-# Fit all classifier-calibrator pairs
+# Calibrate the classifiers with temperature scaling.
+# The calibrators are trained with the output of
+# `decision_function` for the support vector classifier
+# and logistic regression, while they are trained with
+# `predict_proba` for the decision tree classifier.
SVC_scaled.fit(X_train,y_train) Logistic_scaled.fit(X_train,y_train) Tree_scaled.fit(X_train,y_train) -# compare_classifier.fit(X_train, y_train) print("Optimal Temperatures For Each Classifiers") print(f"{SVC_scaled.calibrated_classifiers_[0].calibrators[0].T_=}") print(f"{Logistic_scaled.calibrated_classifiers_[0].calibrators[0].T_=}") print(f"{Tree_scaled.calibrated_classifiers_[0].calibrators[0].T_=}") +print('\n') print("Printing calibrated probabilities...") -print(f"{SVC_scaled.predict_proba(X_calib) = }") -print(f"{Logistic_scaled.predict_proba(X_calib) = }") +print(f"{SVC_scaled.predict_proba(X_calib)=}") +print(f"{Logistic_scaled.predict_proba(X_calib)=}") print(f"{Tree_scaled.predict_proba(X_calib)=}") -# print(f"{compare_classifier.predict_proba(X_calib)=}") print(f"{y_calib=}") diff --git a/sklearn/calibration_temperature.py b/sklearn/calibration_temperature.py index b7c656842f559..19c1b430d2175 100644 --- a/sklearn/calibration_temperature.py +++ b/sklearn/calibration_temperature.py @@ -585,7 +585,7 @@ def _fit_classifier_calibrator_pair( test : ndarray, shape (n_test_indices,) Indices of the testing subset. - method : {'sigmoid', 'isotonic'} + method : {'sigmoid', 'isotonic', 'temperature'} Method to use for calibration. classes : ndarray, shape (n_classes,) @@ -646,7 +646,7 @@ def _fit_calibrator(clf, predictions, y, classes, method, sample_weight=None): classes : ndarray, shape (n_classes,) All the prediction classes. - method : {'sigmoid', 'isotonic'} + method : {'sigmoid', 'isotonic', 'temperature'} The method to use for calibration. sample_weight : ndarray, shape (n_samples,), default=None @@ -696,7 +696,7 @@ class _CalibratedClassifier: classes : array-like of shape (n_classes,) All the prediction classes. - method : {'sigmoid', 'isotonic'}, default='sigmoid' + method : {'sigmoid', 'isotonic', 'temperature'}, default='sigmoid' The method to use for calibration. Can be 'sigmoid' which corresponds to Platt's method or 'isotonic' which is a non-parametric approach based on isotonic regression.
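As a closing illustration of the end-to-end effect, held-out log loss before and after
temperature scaling can be compared using only stable public scikit-learn and SciPy APIs,
without the in-progress `CalibratedClassifierCV_test` class. This is a hedged sketch of
the same technique, assuming `decision_function` logits are available:

    import numpy as np
    from scipy.optimize import minimize
    from scipy.special import softmax
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import log_loss
    from sklearn.model_selection import train_test_split

    X, y = load_iris(return_X_y=True)
    X_train, X_calib, y_train, y_calib = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

    # Fit a single temperature on the held-out logits by minimising log loss,
    # with the same [1e-2, inf) bound as [PATCH 16/17].
    logits = clf.decision_function(X_calib)
    nll = lambda t: log_loss(y_calib, softmax(logits / t[0], axis=1), labels=clf.classes_)
    T = minimize(nll, np.array([1.0]), method="L-BFGS-B", bounds=[(1e-2, None)]).x[0]

    print("optimal temperature:", T)
    print("log loss before:", log_loss(y_calib, clf.predict_proba(X_calib)))
    print("log loss after: ", log_loss(y_calib, softmax(logits / T, axis=1)))

For a model that is already well calibrated, T stays close to 1 and the two log losses
are nearly identical; for over-confident models, T > 1 and the calibrated loss drops.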