
Commit 810b920

OmarManzoor and ogrisel authored
FEA D2 Brier Score (#28971)
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
1 parent a589342 commit 810b920

File tree

6 files changed: +369 -3 lines changed


doc/modules/model_evaluation.rst

Lines changed: 48 additions & 2 deletions
@@ -233,6 +233,7 @@ Scoring string name Function
 'roc_auc_ovr_weighted'       :func:`metrics.roc_auc_score`
 'roc_auc_ovo_weighted'       :func:`metrics.roc_auc_score`
 'd2_log_loss_score'          :func:`metrics.d2_log_loss_score`
+'d2_brier_score'             :func:`metrics.d2_brier_score`

 **Clustering**
 'adjusted_mutual_info_score' :func:`metrics.adjusted_mutual_info_score`
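
The new 'd2_brier_score' entry in the scoring table means the metric can be requested by name wherever a `scoring` string is accepted. A minimal sketch, not part of this diff, assuming the scorer registration that this PR adds in files not shown in this excerpt:

    # Sketch: request the new metric by its scoring string in cross-validation.
    # Assumes the "d2_brier_score" scorer registration added elsewhere in this PR.
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score

    X, y = make_classification(n_samples=200, random_state=0)
    scores = cross_val_score(
        LogisticRegression(), X, y, scoring="d2_brier_score", cv=5
    )
    # Each fold's score is 1 - brier(model) / brier(null model); higher is better.
    print(scores.mean())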
@@ -506,6 +507,7 @@ Some of these are restricted to the binary classification case:
    roc_curve
    class_likelihood_ratios
    det_curve
+   d2_brier_score


 Others also work in the multiclass case:
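
For the binary case the function also accepts 1-D probabilities of the positive class, mirroring :func:`brier_score_loss` (see the docstring added in _classification.py further below). A small sketch of that binary form, not part of the diff:

    # Sketch: binary usage with 1-D probabilities of the positive class.
    from sklearn.metrics import d2_brier_score

    y_true = [0, 1, 1, 0]
    y_proba = [0.1, 0.8, 0.7, 0.2]  # estimated P(y == 1) for each sample

    # The null model constantly predicts the 50/50 class balance of y_true.
    print(d2_brier_score(y_true, y_proba))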
@@ -2156,15 +2158,15 @@ D² score for classification
 The D² score computes the fraction of deviance explained.
 It is a generalization of R², where the squared error is generalized and replaced
 by a classification deviance of choice :math:`\text{dev}(y, \hat{y})`
-(e.g., Log loss). D² is a form of a *skill score*.
+(e.g., Log loss, Brier score). D² is a form of a *skill score*.
 It is calculated as

 .. math::

   D^2(y, \hat{y}) = 1 - \frac{\text{dev}(y, \hat{y})}{\text{dev}(y, y_{\text{null}})} \,.

 Where :math:`y_{\text{null}}` is the optimal prediction of an intercept-only model
-(e.g., the per-class proportion of `y_true` in the case of the Log loss).
+(e.g., the per-class proportion of `y_true` in the case of the Log loss and Brier score).

 Like R², the best possible score is 1.0 and it can be negative (because the
 model can be arbitrarily worse). A constant model that always predicts
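
To make the definition concrete for the Brier deviance introduced by this diff, here is a minimal NumPy sketch, not part of the diff, that reproduces the third usage example added further below, where the deviance is the multiclass Brier score, i.e. the mean over samples of the squared error against the one-hot encoded targets:

    import numpy as np

    y_true = np.array([1, 2, 3])
    y_proba = np.array([
        [0.1, 0.6, 0.3],
        [0.1, 0.6, 0.3],
        [0.4, 0.5, 0.1],
    ])

    classes = np.unique(y_true)
    y_onehot = (y_true[:, None] == classes[None, :]).astype(float)

    # Deviance of the model: multiclass Brier score.
    dev_model = np.mean(np.sum((y_onehot - y_proba) ** 2, axis=1))

    # Deviance of the null model: constantly predict the class proportions of y_true.
    y_null = np.tile(y_onehot.mean(axis=0), (len(y_true), 1))
    dev_null = np.mean(np.sum((y_onehot - y_null) ** 2, axis=1))

    print(1 - dev_model / dev_null)  # -0.37, matching the -0.370... doctest added below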
@@ -2210,6 +2212,50 @@ of 0.0.
   -0.552


+|details-start|
+**D2 Brier score**
+|details-split|
+
+The :func:`d2_brier_score` function implements the special case
+of D² with the Brier score, see :ref:`brier_score_loss`, i.e.:
+
+.. math::
+
+  \text{dev}(y, \hat{y}) = \text{brier_score_loss}(y, \hat{y}).
+
+This is also referred to as the Brier Skill Score (BSS).
+
+Here are some usage examples of the :func:`d2_brier_score` function::
+
+  >>> from sklearn.metrics import d2_brier_score
+  >>> y_true = [1, 1, 2, 3]
+  >>> y_pred = [
+  ...    [0.5, 0.25, 0.25],
+  ...    [0.5, 0.25, 0.25],
+  ...    [0.5, 0.25, 0.25],
+  ...    [0.5, 0.25, 0.25],
+  ... ]
+  >>> d2_brier_score(y_true, y_pred)
+  0.0
+  >>> y_true = [1, 2, 3]
+  >>> y_pred = [
+  ...    [0.98, 0.01, 0.01],
+  ...    [0.01, 0.98, 0.01],
+  ...    [0.01, 0.01, 0.98],
+  ... ]
+  >>> d2_brier_score(y_true, y_pred)
+  0.9991
+  >>> y_true = [1, 2, 3]
+  >>> y_pred = [
+  ...    [0.1, 0.6, 0.3],
+  ...    [0.1, 0.6, 0.3],
+  ...    [0.4, 0.5, 0.1],
+  ... ]
+  >>> d2_brier_score(y_true, y_pred)
+  -0.370...
+
+|details-end|
+
 .. _multilabel_ranking_metrics:

 Multilabel ranking metrics
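
The hunk above notes that this quantity is also known as the Brier Skill Score (BSS). A quick binary sanity check of that identity, not part of the diff, using only :func:`brier_score_loss` from the existing API:

    import numpy as np
    from sklearn.metrics import brier_score_loss, d2_brier_score

    y_true = np.array([0, 1, 1, 0, 1])
    y_proba = np.array([0.2, 0.9, 0.6, 0.3, 0.8])

    # Reference forecast: constantly predict the positive-class frequency (3/5).
    y_proba_ref = np.full_like(y_proba, y_true.mean())

    bss = 1 - brier_score_loss(y_true, y_proba) / brier_score_loss(y_true, y_proba_ref)
    print(bss)                              # ~0.7167
    print(d2_brier_score(y_true, y_proba))  # should agree with the BSS above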
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+- :func:`metrics.d2_brier_score` has been added, which computes the D^2 score with the Brier score as the deviance.
+  By :user:`Omar Salman <OmarManzoor>`.

sklearn/metrics/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@
     classification_report,
     cohen_kappa_score,
     confusion_matrix,
+    d2_brier_score,
     d2_log_loss_score,
     f1_score,
     fbeta_score,
@@ -124,6 +125,7 @@
     "consensus_score",
     "coverage_error",
     "d2_absolute_error_score",
+    "d2_brier_score",
     "d2_log_loss_score",
     "d2_pinball_score",
     "d2_tweedie_score",

sklearn/metrics/_classification.py

Lines changed: 102 additions & 0 deletions
@@ -3744,3 +3744,105 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None):
     )

     return float(1 - (numerator / denominator))
+
+
+@validate_params(
+    {
+        "y_true": ["array-like"],
+        "y_proba": ["array-like"],
+        "sample_weight": ["array-like", None],
+        "pos_label": [Real, str, "boolean", None],
+        "labels": ["array-like", None],
+    },
+    prefer_skip_nested_validation=True,
+)
+def d2_brier_score(
+    y_true,
+    y_proba,
+    *,
+    sample_weight=None,
+    pos_label=None,
+    labels=None,
+):
+    """:math:`D^2` score function, fraction of Brier score explained.
+
+    Best possible score is 1.0 and it can be negative because the model can
+    be arbitrarily worse than the null model. The null model, also known as the
+    optimal intercept model, is a model that constantly predicts the per-class
+    proportions of `y_true`, disregarding the input features. The null model
+    gets a D^2 score of 0.0.
+
+    Read more in the :ref:`User Guide <d2_score_classification>`.
+
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,)
+        True targets.
+
+    y_proba : array-like of shape (n_samples,) or (n_samples, n_classes)
+        Predicted probabilities. If `y_proba.shape = (n_samples,)`
+        the probabilities provided are assumed to be that of the
+        positive class. If `y_proba.shape = (n_samples, n_classes)`
+        the columns in `y_proba` are assumed to correspond to the
+        labels in alphabetical order, as done by
+        :class:`~sklearn.preprocessing.LabelBinarizer`.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights.
+
+    pos_label : int, float, bool or str, default=None
+        Label of the positive class. `pos_label` will be inferred in the
+        following manner:
+
+        * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1;
+        * else if `y_true` contains string, an error will be raised and
+          `pos_label` should be explicitly specified;
+        * otherwise, `pos_label` defaults to the greater label,
+          i.e. `np.unique(y_true)[-1]`.
+
+    labels : array-like of shape (n_classes,), default=None
+        Class labels when `y_proba.shape = (n_samples, n_classes)`.
+        If not provided, labels will be inferred from `y_true`.
+
+    Returns
+    -------
+    d2 : float
+        The D^2 score.
+
+    References
+    ----------
+    .. [1] `Wikipedia entry for the Brier Skill Score (BSS)
+           <https://en.wikipedia.org/wiki/Brier_score>`_.
+    """
+    if _num_samples(y_proba) < 2:
+        msg = "D^2 score is not well-defined with less than two samples."
+        warnings.warn(msg, UndefinedMetricWarning)
+        return float("nan")
+
+    # brier score of the fitted model
+    brier_score = brier_score_loss(
+        y_true=y_true,
+        y_proba=y_proba,
+        sample_weight=sample_weight,
+        pos_label=pos_label,
+        labels=labels,
+    )
+
+    # brier score of the reference or baseline model
+    y_true = column_or_1d(y_true)
+    weights = _check_sample_weight(sample_weight, y_true)
+    labels = np.unique(y_true if labels is None else labels)
+
+    mask = y_true[None, :] == labels[:, None]
+    label_counts = (mask * weights).sum(axis=1)
+    y_prob = label_counts / weights.sum()
+    y_proba_ref = np.tile(y_prob, (len(y_true), 1))
+    brier_score_ref = brier_score_loss(
+        y_true=y_true,
+        y_proba=y_proba_ref,
+        sample_weight=sample_weight,
+        pos_label=pos_label,
+        labels=labels,
+    )
+
+    return 1 - brier_score / brier_score_ref
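
The reference model above is built directly from weighted class frequencies. A small standalone sketch, not part of the diff, of just those lines, showing the shape of the resulting `y_proba_ref` for a toy weighted sample:

    import numpy as np

    y_true = np.array([0, 0, 1, 2])
    weights = np.array([1.0, 1.0, 2.0, 1.0])
    labels = np.unique(y_true)

    mask = y_true[None, :] == labels[:, None]        # (n_classes, n_samples) membership
    label_counts = (mask * weights).sum(axis=1)      # weighted count per class: [2., 2., 1.]
    y_prob = label_counts / weights.sum()            # class proportions: [0.4, 0.4, 0.2]
    y_proba_ref = np.tile(y_prob, (len(y_true), 1))  # every row predicts the same proportions

    print(y_proba_ref.shape)  # (4, 3)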

0 commit comments
