ENH add CAP curve #28972

Status: Open. Wants to merge 102 commits from the cap-curve branch into base: main.

Commits (102)
dc058d9
add cap curve main class and tests
JosephBARBIERDARNAL May 7, 2024
4e73049
linting issue
JosephBARBIERDARNAL May 11, 2024
342de96
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL May 11, 2024
716d812
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Jun 3, 2024
d03c666
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Jun 25, 2024
3453696
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Sep 29, 2024
0e0073a
Merge branch 'scikit-learn:main' into cap-curve
JosephBARBIERDARNAL Oct 6, 2024
0a83ca2
fix cap curve name/description
JosephBARBIERDARNAL Oct 6, 2024
36f7ffa
cap curve tests + docstrings + small updates
JosephBARBIERDARNAL Oct 6, 2024
192bc8b
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Jan 11, 2025
72da696
various capcurve improvements + tests
JosephBARBIERDARNAL Jan 11, 2025
2415818
add CAP curve to display comparisons in doc
JosephBARBIERDARNAL Jan 11, 2025
ca06e54
add CAP curve to display comparisons in doc
JosephBARBIERDARNAL Jan 11, 2025
e6a000a
Merge branch 'main' into cap-curve
glemaitre Feb 5, 2025
a631c22
Merge branch 'main' into cap-curve
ogrisel Apr 1, 2025
fe4f210
Fix matplotlib related test failures and warnings
ogrisel Apr 1, 2025
d57b63b
Small improvement to the example
ogrisel Apr 1, 2025
9d50d67
Formatting.
ogrisel Apr 1, 2025
c14018a
Consistent ordering of the curves
ogrisel Apr 1, 2025
4837bb3
Avoid duplicted plotting of 'chance level'
ogrisel Apr 1, 2025
2846360
Typo
JosephBARBIERDARNAL Apr 1, 2025
530fb7f
Fix broken doctest
ogrisel Apr 1, 2025
0cb7558
Add missing docstring entry for plot_chance_level
ogrisel Apr 1, 2025
9f37f0c
Add missing See Also and Examples section in the docstrings
ogrisel Apr 1, 2025
60078cd
Illustrate the use of pos_label
ogrisel Apr 1, 2025
d14f49c
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Apr 5, 2025
2e27a67
use pos_label for ylabel
JosephBARBIERDARNAL Apr 5, 2025
7a4b5bc
ensure estimator is fitted and with less than 3 classes
JosephBARBIERDARNAL Apr 5, 2025
978279f
ensure estimator is fitted and with less than 3 classes
JosephBARBIERDARNAL Apr 5, 2025
f27658b
handle pos_label when response_method is decision_function
JosephBARBIERDARNAL Apr 5, 2025
bb72a7b
parametrize pos_label test with with_strings
JosephBARBIERDARNAL Apr 5, 2025
ac2a90d
unused argument in test
JosephBARBIERDARNAL Apr 5, 2025
0e1a0a1
better y_pred definition in tests
JosephBARBIERDARNAL Apr 5, 2025
74dd589
Check that CAP curve is on or above the chance level line
JosephBARBIERDARNAL Apr 5, 2025
079a815
fix typo docstring CAP curve
JosephBARBIERDARNAL Apr 5, 2025
8be64f4
Better tests and error handling for estimator input
JosephBARBIERDARNAL Apr 5, 2025
a83ea5e
add plot_perfect arg in CAPCurveDisplay
JosephBARBIERDARNAL Apr 5, 2025
02d7db8
complete docstring + test for curve position
JosephBARBIERDARNAL Apr 6, 2025
6630d4e
test plot perfect argument style
JosephBARBIERDARNAL Apr 6, 2025
07b4968
add cap curve to doc/visualization
JosephBARBIERDARNAL Apr 6, 2025
4c08cd6
complete docstring
JosephBARBIERDARNAL Apr 8, 2025
257991d
complete docstring
JosephBARBIERDARNAL Apr 8, 2025
1f3cd4c
document attributes
JosephBARBIERDARNAL Apr 8, 2025
6845458
complete docstring
JosephBARBIERDARNAL Apr 8, 2025
29d5d29
use same sample_weight type as input
JosephBARBIERDARNAL Apr 8, 2025
e56b1da
rename to y_true_bool instead of y_bool
JosephBARBIERDARNAL Apr 8, 2025
51bfa6a
comp
JosephBARBIERDARNAL Apr 8, 2025
ad8dbea
regressor support
JosephBARBIERDARNAL Apr 8, 2025
8931425
better error message
JosephBARBIERDARNAL Apr 8, 2025
b4532e7
use exact match to test error message
JosephBARBIERDARNAL Apr 8, 2025
6d21bcd
import re
JosephBARBIERDARNAL Apr 8, 2025
6d8d010
test for response_method="auto"
JosephBARBIERDARNAL Apr 8, 2025
ca17f97
use correct test name
JosephBARBIERDARNAL Apr 8, 2025
0e72f4b
plot_perfect for RF in example
JosephBARBIERDARNAL Apr 8, 2025
8e42ac3
set perfect line to black and change linestyle
JosephBARBIERDARNAL Apr 8, 2025
ccc2f80
test for y_true_cumulative dtype equals cumulative dtype
JosephBARBIERDARNAL Apr 8, 2025
bfc7c6d
tests for LinearSVC
JosephBARBIERDARNAL Apr 8, 2025
f3a04a2
extend cap curve tests
JosephBARBIERDARNAL Apr 12, 2025
589ef95
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Apr 12, 2025
369f030
add to changelog
JosephBARBIERDARNAL Apr 12, 2025
0e67127
check for valid dtype
JosephBARBIERDARNAL Apr 12, 2025
73df2f4
fix html rendering in doc
JosephBARBIERDARNAL Apr 12, 2025
de936c9
better test name
JosephBARBIERDARNAL Apr 12, 2025
3455685
update LogisticRegression test
JosephBARBIERDARNAL Apr 12, 2025
4760d76
remove incorrect function mention
JosephBARBIERDARNAL Apr 12, 2025
3d0e722
fix missing definition in test
JosephBARBIERDARNAL Apr 12, 2025
54c6197
remove dupplicated tests in cap curve and use test_common_display ins…
JosephBARBIERDARNAL Apr 12, 2025
ec68704
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Apr 14, 2025
5fee02b
add test logistic regression
JosephBARBIERDARNAL Apr 14, 2025
f3c00f3
update changelog cap curve
JosephBARBIERDARNAL Apr 14, 2025
88c2fb5
update example display classif
JosephBARBIERDARNAL Apr 14, 2025
12f58a5
add despine arg
JosephBARBIERDARNAL Apr 14, 2025
17c3b0b
docstring for regressors
JosephBARBIERDARNAL Apr 14, 2025
8f8033b
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Apr 23, 2025
d4de0bc
add error when y_true has negative values
JosephBARBIERDARNAL Apr 23, 2025
2184bab
explicit regression option in docstring
JosephBARBIERDARNAL Apr 23, 2025
f7f4321
add pyplot fixture
JosephBARBIERDARNAL Apr 23, 2025
39ed91e
add pyplot fixture
JosephBARBIERDARNAL Apr 23, 2025
97bbc59
add support for regression
JosephBARBIERDARNAL Apr 23, 2025
eb6fa46
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Apr 23, 2025
f7932f4
add support for regression
JosephBARBIERDARNAL Apr 23, 2025
0de6123
try to fix lorenz curve position
JosephBARBIERDARNAL Apr 23, 2025
d990efb
try to fix lorenz curve position
JosephBARBIERDARNAL Apr 23, 2025
5b6222a
Update narrative of chance level in the example
ogrisel Apr 25, 2025
54a1dc3
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Apr 28, 2025
6839c63
update poisson regression non normal example
JosephBARBIERDARNAL Apr 28, 2025
62a5505
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL Apr 28, 2025
34437aa
update tweedie regression insurance claim example
JosephBARBIERDARNAL Apr 28, 2025
ce21ab8
update poisson regression non normal example
JosephBARBIERDARNAL Apr 28, 2025
ea7b322
remove gini function in examples
JosephBARBIERDARNAL Apr 28, 2025
5789286
Merge branch 'main' into cap-curve
JosephBARBIERDARNAL May 3, 2025
343e740
Update examples/linear_model/plot_poisson_regression_non_normal_loss.py
JosephBARBIERDARNAL May 3, 2025
ed76d2a
Update examples/linear_model/plot_poisson_regression_non_normal_loss.py
JosephBARBIERDARNAL May 3, 2025
a8547c6
Update examples/model_selection/plot_det.py
JosephBARBIERDARNAL May 3, 2025
8371c06
handle explicit classification prob
JosephBARBIERDARNAL May 3, 2025
bfed568
test despine
JosephBARBIERDARNAL May 3, 2025
148043c
use consistent variable names
JosephBARBIERDARNAL May 3, 2025
40d0ec3
fix match in tests
JosephBARBIERDARNAL May 3, 2025
d5a4e99
handle x_max/y_max equal 0
JosephBARBIERDARNAL May 3, 2025
91bbc6d
fix typo error test
JosephBARBIERDARNAL May 3, 2025
8d97373
add tests for name
JosephBARBIERDARNAL May 3, 2025
290d162
refacto test attributes and add edge cases
JosephBARBIERDARNAL May 3, 2025
1 change: 1 addition & 0 deletions doc/visualizations.rst
@@ -82,5 +82,6 @@ Display Objects
     metrics.PrecisionRecallDisplay
     metrics.PredictionErrorDisplay
     metrics.RocCurveDisplay
+    metrics.CAPCurveDisplay
     model_selection.LearningCurveDisplay
     model_selection.ValidationCurveDisplay
5 additions & 0 deletions (new changelog entry)
@@ -0,0 +1,5 @@
+- :class:`metrics.CAPCurveDisplay` implements the Cumulative Accuracy Profile
+  (CAP), which plots the cumulative count of positive outcomes on the y-axis
+  against the cumulative proportion of cases, ordered by predicted value, on the
+  x-axis. By :user:`Joseph Barbier <josephbarbierdarnal>`, :user:`Olivier Grisel <ogrisel>`
+  and :user:`Christian Lorentzen <lorentzenchr>`.
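
For reference, a minimal usage sketch of the new display, assuming the `from_estimator` API and the `plot_chance_level` / `plot_perfect` arguments exercised in the diffs below (this PR is not merged yet, so the exact signature may still change):

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import CAPCurveDisplay
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1_000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression().fit(X_train, y_train)

# Plot the CAP curve along with the chance-level and perfect-model reference
# lines (both arguments appear in this PR's diffs).
CAPCurveDisplay.from_estimator(
    clf, X_test, y_test, plot_chance_level=True, plot_perfect=True
)
plt.show()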
59 changes: 29 additions & 30 deletions examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -489,52 +489,51 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100
 #
 # This plot is called a Lorenz curve and can be summarized by the Gini index:

-from sklearn.metrics import auc
-
-
-def lorenz_curve(y_true, y_pred, exposure):
-    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
-    exposure = np.asarray(exposure)
-
-    # order samples by increasing predicted risk:
-    ranking = np.argsort(y_pred)
-    ranked_frequencies = y_true[ranking]
-    ranked_exposure = exposure[ranking]
-    cumulated_claims = np.cumsum(ranked_frequencies * ranked_exposure)
-    cumulated_claims /= cumulated_claims[-1]
-    cumulated_exposure = np.cumsum(ranked_exposure)
-    cumulated_exposure /= cumulated_exposure[-1]
-    return cumulated_exposure, cumulated_claims
-
+from sklearn.metrics import CAPCurveDisplay, auc

 fig, ax = plt.subplots(figsize=(8, 8))

 for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]:
     y_pred = model.predict(df_test)
-    cum_exposure, cum_claims = lorenz_curve(
-        df_test["Frequency"], y_pred, df_test["Exposure"]
-    )
-    gini = 1 - 2 * auc(cum_exposure, cum_claims)
-    label = "{} (Gini: {:.2f})".format(model[-1], gini)
-    ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)
+    y_true = df_test["Frequency"].values
+    sample_weight = df_test["Exposure"].values
+
+    disp = CAPCurveDisplay.from_predictions(
+        y_true,
+        y_pred,
+        sample_weight=sample_weight,
+        ax=ax,
+        plot_chance_level=False,
+        plot_perfect=False,
+    )
+
+    gini = 1 - 2 * auc(disp.cumulative_total, disp.y_true_cumulative)
+    disp.line_.set_label(f"{model[-1]} (Gini={gini:.2f})")

 # Oracle model: y_pred == y_test
-cum_exposure, cum_claims = lorenz_curve(
-    df_test["Frequency"], df_test["Frequency"], df_test["Exposure"]
-)
-gini = 1 - 2 * auc(cum_exposure, cum_claims)
-label = "Oracle (Gini: {:.2f})".format(gini)
-ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label)
+disp = CAPCurveDisplay.from_predictions(
+    y_true,
+    y_true,
+    sample_weight=sample_weight,
+    name="Oracle model",
+    ax=ax,
+    plot_chance_level=False,
+    plot_perfect=False,
+    ls="--",
+    color="k",
+)
+gini = 1 - 2 * auc(disp.cumulative_total, disp.y_true_cumulative)
+ax.get_lines()[-1].set_label(f"Oracle model (Gini={gini:.2f})")

 # Random Baseline
 ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline")
 ax.set(
-    title="Lorenz curves by model",
+    title="Lorenz curves",
     xlabel="Cumulative proportion of exposure (from safest to riskiest)",
     ylabel="Cumulative proportion of claims",
 )

 ax.legend(loc="upper left")


 # %%
 # As expected, the dummy regressor is unable to correctly rank the samples and
 # therefore performs the worst on this plot.
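
The example above relies on the identity Gini = 1 - 2 * AUC for a Lorenz curve whose samples are ordered from safest to riskiest. A self-contained check of that identity on a toy curve; only numpy and the long-standing `sklearn.metrics.auc` helper are used, while the variable names mirror the `cumulative_total` and `y_true_cumulative` display attributes introduced by this PR:

import numpy as np
from sklearn.metrics import auc

# Toy Lorenz curve: a model with some ranking power yields a convex curve
# below the diagonal when samples are ordered from safest to riskiest.
cumulative_total = np.linspace(0, 1, 6)  # x-axis: cumulative share of exposure
y_true_cumulative = cumulative_total**2  # y-axis: cumulative share of claims

gini = 1 - 2 * auc(cumulative_total, y_true_cumulative)
print(f"Gini={gini:.2f}")  # 0.32 by the trapezoidal rule; exactly 1/3 in the limit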
58 changes: 30 additions & 28 deletions examples/linear_model/plot_tweedie_regression_insurance_claims.py
@@ -645,50 +645,50 @@ def score_estimator(
 # directly fit on the pure premium is operationally simpler to develop and
 # maintain as it consists of a single scikit-learn estimator instead of a pair
 # of models, each with its own set of hyperparameters.
-from sklearn.metrics import auc
-
-
-def lorenz_curve(y_true, y_pred, exposure):
-    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
-    exposure = np.asarray(exposure)
-
-    # order samples by increasing predicted risk:
-    ranking = np.argsort(y_pred)
-    ranked_exposure = exposure[ranking]
-    ranked_pure_premium = y_true[ranking]
-    cumulative_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)
-    cumulative_claim_amount /= cumulative_claim_amount[-1]
-    cumulative_exposure = np.cumsum(ranked_exposure)
-    cumulative_exposure /= cumulative_exposure[-1]
-    return cumulative_exposure, cumulative_claim_amount
-
+from sklearn.metrics import CAPCurveDisplay, auc

 fig, ax = plt.subplots(figsize=(8, 8))

 y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test)
 y_pred_total = glm_pure_premium.predict(X_test)
+y_true = df_test["PurePremium"].values
+sample_weight = df_test["Exposure"].values

 for label, y_pred in [
     ("Frequency * Severity model", y_pred_product),
     ("Compound Poisson Gamma", y_pred_total),
 ]:
-    cum_exposure, cum_claims = lorenz_curve(
-        df_test["PurePremium"], y_pred, df_test["Exposure"]
-    )
-    gini = 1 - 2 * auc(cum_exposure, cum_claims)
-    label += " (Gini index: {:.3f})".format(gini)
-    ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)
+    disp = CAPCurveDisplay.from_predictions(
+        y_true,
+        y_pred,
+        sample_weight=sample_weight,
+        ax=ax,
+        plot_chance_level=False,
+        plot_perfect=False,
+    )
+
+    gini = 1 - 2 * auc(disp.cumulative_total, disp.y_true_cumulative)
+    label += f" (Gini={gini:.3f})"
+    disp.line_.set_label(label)

 # Oracle model: y_pred == y_test
-cum_exposure, cum_claims = lorenz_curve(
-    df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"]
-)
-gini = 1 - 2 * auc(cum_exposure, cum_claims)
-label = "Oracle (Gini index: {:.3f})".format(gini)
-ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label)
+disp = CAPCurveDisplay.from_predictions(
+    y_true,
+    y_true,
+    sample_weight=sample_weight,
+    ax=ax,
+    plot_chance_level=False,
+    plot_perfect=False,
+    ls="--",
+    color="k",
+)
+
+gini = 1 - 2 * auc(disp.cumulative_total, disp.y_true_cumulative)
+label = f"Oracle (Gini={gini:.3f})"
+disp.line_.set_label(label)

 # Random baseline
-ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline")
+ax.plot([0, 1], [0, 1], linestyle="--", label="Random baseline")
 ax.set(
     title="Lorenz Curves",
     xlabel=(
@@ -698,3 +698,5 @@ def lorenz_curve(y_true, y_pred, exposure):
 )
 ax.legend(loc="upper left")
 plt.plot()
+
+# %%
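
Both insurance examples pass the policy exposure as `sample_weight`. A minimal sketch of the weighted cumulative curve the display is expected to compute internally, equivalent to the `lorenz_curve` helper removed above (the actual implementation in the PR may differ):

import numpy as np

def weighted_cap_curve(y_true, y_pred, sample_weight):
    # Order samples by increasing predicted risk, then accumulate the
    # exposure-weighted outcomes and normalize both axes to [0, 1].
    order = np.argsort(y_pred)
    cum_true = np.cumsum(y_true[order] * sample_weight[order])
    cum_weight = np.cumsum(sample_weight[order])
    return cum_weight / cum_weight[-1], cum_true / cum_true[-1]

x, y = weighted_cap_curve(
    y_true=np.array([0.0, 2.0, 1.0]),
    y_pred=np.array([0.2, 0.9, 0.5]),
    sample_weight=np.array([1.0, 1.0, 2.0]),
)
print(x, y)  # [0.25 0.75 1.  ] [0.  0.5 1. ]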
134 changes: 97 additions & 37 deletions examples/model_selection/plot_det.py
@@ -4,17 +4,22 @@
 ====================================

-In this example, we compare two binary classification multi-threshold metrics:
-the Receiver Operating Characteristic (ROC) and the Detection Error Tradeoff
-(DET). For such purpose, we evaluate two different classifiers for the same
-classification task.
+In this example, we compare three binary classification multi-threshold metrics:
+the Receiver Operating Characteristic (ROC), the Detection Error Tradeoff (DET)
+and the Cumulative Accuracy Profile (CAP). For this purpose, we evaluate
+two different classifiers on the same classification task.

 ROC curves feature true positive rate (TPR) on the Y axis, and false positive
 rate (FPR) on the X axis. This means that the top left corner of the plot is the
 "ideal" point - a FPR of zero, and a TPR of one.

+CAP curves display the cumulative proportion of true positives on the Y axis
+versus the cumulative proportion of the dataset (ranked by predicted probability)
+on the X axis.
+
 DET curves are a variation of ROC curves where False Negative Rate (FNR) is
 plotted on the y-axis instead of the TPR. In this case the origin (bottom left
-corner) is the "ideal" point.
+corner) is the "ideal" point. Furthermore, the axes use a normal deviate scale
+to focus on differences closer to the origin.

 .. note::

@@ -40,6 +45,8 @@
 # Generate synthetic data
 # -----------------------

+import numpy as np
+
 from sklearn.datasets import make_classification
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
@@ -52,6 +59,9 @@
     random_state=1,
     n_clusters_per_class=1,
 )
+# Use string labels for the classes to illustrate the use of the `pos_label`
+# parameter to identify which class should be considered as the "positive" class.
+y = np.array(["Class A", "Class B"])[y]

 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

@@ -60,7 +70,9 @@
 # ----------------------
 #
 # Here we define two different classifiers. The goal is to visually compare their
-# statistical performance across thresholds using the ROC and DET curves.
+# statistical performance across thresholds using the ROC, CAP and DET curves. There
+# is no particular reason why these classifiers are chosen over other classifiers
+# available in scikit-learn.

 from sklearn.dummy import DummyClassifier
 from sklearn.ensemble import RandomForestClassifier
@@ -76,8 +88,8 @@
 }

 # %%
-# Compare ROC and DET curves
-# --------------------------
+# Compare ROC, DET and CAP curves
+# -------------------------------
 #
 # DET curves are commonly plotted in normal deviate scale. To achieve this the
 # DET display transforms the error rates as returned by the
@@ -86,56 +98,93 @@

 import matplotlib.pyplot as plt

-from sklearn.dummy import DummyClassifier
-from sklearn.metrics import DetCurveDisplay, RocCurveDisplay
+from sklearn.metrics import CAPCurveDisplay, DetCurveDisplay, RocCurveDisplay

-fig, [ax_roc, ax_det] = plt.subplots(1, 2, figsize=(11, 5))
+fig, [ax_roc, ax_det, ax_cap] = plt.subplots(
+    1, 3, figsize=(15, 5), constrained_layout=True
+)
+pos_label = "Class A"
+for clf_idx, (name, clf) in enumerate(classifiers.items()):
+    is_last = clf_idx == len(classifiers) - 1
+    clf.fit(X_train, y_train)
+
+    RocCurveDisplay.from_estimator(
+        clf,
+        X_test,
+        y_test,
+        ax=ax_roc,
+        name=name,
+        pos_label=pos_label,
+        plot_chance_level=is_last,
+    )
+    DetCurveDisplay.from_estimator(
+        clf,
+        X_test,
+        y_test,
+        ax=ax_det,
+        name=name,
+        pos_label=pos_label,
+    )
+    CAPCurveDisplay.from_estimator(
+        clf,
+        X_test,
+        y_test,
+        ax=ax_cap,
+        name=name,
+        pos_label=pos_label,
+        plot_chance_level=is_last,
+        plot_perfect=is_last,
+    )

 ax_roc.set_title("Receiver Operating Characteristic (ROC) curves")
 ax_det.set_title("Detection Error Tradeoff (DET) curves")
+ax_cap.set_title("Cumulative Accuracy Profile (CAP) curves")

 ax_roc.grid(linestyle="--")
 ax_det.grid(linestyle="--")
-
-for name, clf in classifiers.items():
-    (color, linestyle) = (
-        ("black", "--") if name == "Non-informative baseline" else (None, None)
-    )
-    clf.fit(X_train, y_train)
-    RocCurveDisplay.from_estimator(
-        clf, X_test, y_test, ax=ax_roc, name=name, color=color, linestyle=linestyle
-    )
-    DetCurveDisplay.from_estimator(
-        clf, X_test, y_test, ax=ax_det, name=name, color=color, linestyle=linestyle
-    )
+ax_cap.grid(linestyle="--")

 plt.legend()
 plt.show()

 # %%
-# Notice that it is easier to visually assess the overall performance of
-# different classification algorithms using DET curves than using ROC curves. As
-# ROC curves are plot in a linear scale, different classifiers usually appear
+# Analysis
+# --------
+#
+# All curves agree that the Random Forest classifier has more discriminative
+# power than the Linear SVM:
+#
+# - the area under the ROC and CAP curves is larger for the Random Forest
+#   classifier,
+# - the ROC and CAP curves are uniformly closer to the top left corner for the
+#   Random Forest classifier,
+# - the DET curve is uniformly closer to the origin for the Random Forest
+#   classifier.
+#
+# Notice that it is easier to visually assess that the Random Forest classifier
+# performs better than the Linear SVM classifier using DET curves. As ROC and
+# CAP curves are plotted with a linear scale, different classifiers usually appear
 # similar for a large part of the plot and differ the most in the top left
 # corner of the graph. On the other hand, because DET curves represent straight
 # lines in normal deviate scale, they tend to be distinguishable as a whole and
 # the area of interest spans a large part of the plot.
 #
 # DET curves give direct feedback of the detection error tradeoff to aid in
-# operating point analysis. The user can then decide the FNR they are willing to
-# accept at the expense of the FPR (or vice-versa).
+# operating point analysis. The user can then decide the FNR they are willing
+# to accept at the expense of the FPR (or vice-versa).
 #
-# Non-informative classifier baseline for the ROC and DET curves
-# --------------------------------------------------------------
+# Non-informative classifier baseline
+# -----------------------------------
 #
-# The diagonal black-dotted lines in the plots above correspond to a
-# :class:`~sklearn.dummy.DummyClassifier` using the default "prior" strategy, to
-# serve as baseline for comparison with other classifiers. This classifier makes
-# constant predictions, independent of the input features in `X`, making it a
-# non-informative classifier.
+# The diagonal black-dotted lines named "chance level" in the plots above
+# correspond to the expected value of a non-informative classifier on an
+# infinite evaluation set.
 #
-# To further understand the non-informative baseline of the ROC and DET curves,
-# we recall the following mathematical definitions:
+# The :class:`~sklearn.dummy.DummyClassifier` model makes constant predictions,
+# independently of the input features in `X`, making it a canonical example of
+# such a non-informative classifier. We observe that the ROC and DET curves of
+# the non-informative classifier exactly match the theoretical chance level
+# line. This can be explained as follows. First recall the following
+# mathematical definitions:
 #
 # :math:`\text{FPR} = \frac{\text{FP}}{\text{FP} + \text{TN}}`
 #
@@ -156,3 +205,14 @@
 #
 # - a single point in the lower left corner of the ROC plane,
 # - a single point in the upper left corner of the DET plane.
+#
+# For the non-informative classifier with strategy "prior", the value returned
+# by `predict_proba` is the observed frequency of the positive class in the
+# training set. If the threshold is above this value, the classifier always
+# predicts the negative class, and if the threshold is below this value, the
+# classifier always predicts the positive class. Both ROC and DET curves
+# linearly interpolate between these two points, hence the diagonal line.
+#
+# The CAP curve of the non-informative classifier, however, only approximately
+# lies on the diagonal. The match would be better in the limit of an infinite
+# evaluation set.
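
To make the last point concrete, here is a small check, independent of this PR, that the "prior" strategy yields a constant predicted probability equal to the positive-class frequency of the training set:

import numpy as np
from sklearn.dummy import DummyClassifier

y_train = np.array([0, 0, 0, 1])   # positive class frequency: 0.25
X_train = np.zeros((4, 1))         # features are ignored by the dummy model

clf = DummyClassifier(strategy="prior").fit(X_train, y_train)
# Every sample gets the same probability vector: [P(class 0), P(class 1)].
print(clf.predict_proba(np.zeros((2, 1))))  # [[0.75 0.25] [0.75 0.25]]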