From d6e864a2250c5f6f264188fd3ebba86c0ac3e161 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu>
Date: Fri, 2 Aug 2024 09:46:49 +0200
Subject: [PATCH 01/10] DOC Use quantiles instead of std in ROC example with
 cross-validation

---
 examples/model_selection/plot_roc_crossval.py | 27 +++++++++++--------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index fb6432a71ed79..e16b2aa25941c 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -66,6 +66,9 @@
 # plot the ROC curves fold-wise. Notice that the baseline to define the chance
 # level (dashed ROC curve) is a classifier that would always predict the most
 # frequent class.
+#
+# In the following plot, quantile coverage is represented by shades of grey,
+# with darker colors indicating values closer to the median.
 
 import matplotlib.pyplot as plt
 
@@ -112,17 +115,19 @@
     alpha=0.8,
 )
 
-std_tpr = np.std(tprs, axis=0)
-tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
-tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
-ax.fill_between(
-    mean_fpr,
-    tprs_lower,
-    tprs_upper,
-    color="grey",
-    alpha=0.2,
-    label=r"$\pm$ 1 std. dev.",
-)
+quantile_offsets = np.linspace(0.05, 0.45, 5)
+for offset in quantile_offsets:
+    upper_quantile = np.quantile(tprs, 0.5 + offset, axis=0)
+    lower_quantile = np.quantile(tprs, 0.5 - offset, axis=0)
+    label = f"total quantile coverage = {2*offset:.0%}" if offset == 0.45 else None
+    ax.fill_between(
+        mean_fpr,
+        lower_quantile,
+        upper_quantile,
+        color="grey",
+        alpha=0.4,
+        label=label,
+    )
 
 ax.set(
     xlabel="False Positive Rate",

From e37a08937373d3b8a95a8ff500d1b7f285db35e2 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu>
Date: Fri, 2 Aug 2024 09:47:59 +0200
Subject: [PATCH 02/10] Improve wording

---
 examples/model_selection/plot_roc_crossval.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index e16b2aa25941c..d4dd9f182aeca 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -13,12 +13,10 @@
 better. The "steepness" of ROC curves is also important, since it is ideal to
 maximize the TPR while minimizing the FPR.
 
-This example shows the ROC response of different datasets, created from K-fold
-cross-validation. Taking all of these curves, it is possible to calculate the
-mean AUC, and see the variance of the curve when the
-training set is split into different subsets. This roughly shows how the
-classifier output is affected by changes in the training data, and how different
-the splits generated by K-fold cross-validation are from one another.
+This example demonstrates how the classifier's ROC response is influenced by
+variations in the training data as obtained through K-fold cross-validation.
+By analyzing all these curves, we can calculate the mean AUC and visualize the
+variance among them.
 
 .. note::
 

From 9b1d924a59691a6108f25bbab2eb0fae2f10bf38 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Fri, 2 Aug 2024 13:48:26 +0200
Subject: [PATCH 03/10] Apply suggestions from code review

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
---
 examples/model_selection/plot_roc_crossval.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index d4dd9f182aeca..f3fb6cbd62315 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -16,7 +16,8 @@
 This example demonstrates how the classifier's ROC response is influenced by
 variations in the training data as obtained through K-fold cross-validation.
 By analyzing all these curves, we can calculate the mean AUC and visualize the
-variance among them.
+variability of the estimated curves across CV folds via a quantile-based
+region.
 
 .. note::
 
@@ -117,7 +118,7 @@
 for offset in quantile_offsets:
     upper_quantile = np.quantile(tprs, 0.5 + offset, axis=0)
     lower_quantile = np.quantile(tprs, 0.5 - offset, axis=0)
-    label = f"total quantile coverage = {2*offset:.0%}" if offset == 0.45 else None
+    label = f"5% to 95% percentile region" if offset == 0.45 else None
     ax.fill_between(
         mean_fpr,
         lower_quantile,

From 00ada90596815aae8ca4e3be219cf2b091f0c2d7 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu>
Date: Thu, 8 Aug 2024 11:08:02 +0200
Subject: [PATCH 04/10] Fix linter

---
 examples/model_selection/plot_roc_crossval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index f3fb6cbd62315..44c91525f47c3 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -118,7 +118,7 @@
 for offset in quantile_offsets:
     upper_quantile = np.quantile(tprs, 0.5 + offset, axis=0)
     lower_quantile = np.quantile(tprs, 0.5 - offset, axis=0)
-    label = f"5% to 95% percentile region" if offset == 0.45 else None
+    label = "5% to 95% percentile region" if offset == 0.45 else None
     ax.fill_between(
         mean_fpr,
         lower_quantile,

From d977d56cb14f4afed3c7441bcec3a5b33a142793 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu>
Date: Mon, 26 Aug 2024 16:58:26 +0200
Subject: [PATCH 05/10] Plot a single 90 percentile region as per Olivier's
 suggestion

---
 examples/model_selection/plot_roc_crossval.py | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index 44c91525f47c3..d7c5eb9314961 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -114,24 +114,22 @@
     alpha=0.8,
 )
 
-quantile_offsets = np.linspace(0.05, 0.45, 5)
-for offset in quantile_offsets:
-    upper_quantile = np.quantile(tprs, 0.5 + offset, axis=0)
-    lower_quantile = np.quantile(tprs, 0.5 - offset, axis=0)
-    label = "5% to 95% percentile region" if offset == 0.45 else None
-    ax.fill_between(
-        mean_fpr,
-        lower_quantile,
-        upper_quantile,
-        color="grey",
-        alpha=0.4,
-        label=label,
-    )
+
+upper_quantile = np.quantile(tprs, 0.95, axis=0)
+lower_quantile = np.quantile(tprs, 0.05, axis=0)
+ax.fill_between(
+    mean_fpr,
+    lower_quantile,
+    upper_quantile,
+    color="grey",
+    alpha=0.4,
+    label="5% to 95% percentile region",
+)
 
 ax.set(
     xlabel="False Positive Rate",
     ylabel="True Positive Rate",
-    title=f"Mean ROC curve with variability\n(Positive label '{target_names[1]}')",
+    title="Mean ROC curve with variability",
 )
 ax.legend(loc="lower right")
 plt.show()

From 8bfebd21b7380de4ef02774a44fb4becb14714ab Mon Sep 17 00:00:00 2001
From: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu>
Date: Mon, 26 Aug 2024 17:00:39 +0200
Subject: [PATCH 06/10] Iter

---
 examples/model_selection/plot_roc_crossval.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index d7c5eb9314961..56def63f9d63a 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -66,8 +66,8 @@
 # level (dashed ROC curve) is a classifier that would always predict the most
 # frequent class.
 #
-# In the following plot, quantile coverage is represented by shades of grey,
-# with darker colors indicating values closer to the median.
+# In the following plot, quantile coverage is represented in grey, though the
+# AUC value is reported in terms of the mean and standar deviation.
 
 import matplotlib.pyplot as plt
 

From beda9e9fb82c00a605aad8333c346688b4be8090 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu>
Date: Mon, 26 Aug 2024 17:07:53 +0200
Subject: [PATCH 07/10] Use ShuffleSplit, hgbt and make_classification

---
 examples/model_selection/plot_roc_crossval.py | 62 +++++++++----------
 1 file changed, 28 insertions(+), 34 deletions(-)

diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index 56def63f9d63a..76ae187dd499e 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -14,7 +14,7 @@
 maximize the TPR while minimizing the FPR.
 
 This example demonstrates how the classifier's ROC response is influenced by
-variations in the training data as obtained through K-fold cross-validation.
+variations in the training data as obtained through ShuffleSplit cross-validation.
 By analyzing all these curves, we can calculate the mean AUC and visualize the
 variability of the estimated curves across CV folds via a quantile-based
 region.
@@ -33,51 +33,45 @@
 # Load and prepare data
 # =====================
 #
-# We import the :ref:`iris_dataset` which contains 3 classes, each one
-# corresponding to a type of iris plant. One class is linearly separable from
-# the other 2; the latter are **not** linearly separable from each other.
-#
-# In the following we binarize the dataset by dropping the "virginica" class
-# (`class_id=2`). This means that the "versicolor" class (`class_id=1`) is
-# regarded as the positive class and "setosa" as the negative class
-# (`class_id=0`).
-
-import numpy as np
-
-from sklearn.datasets import load_iris
-
-iris = load_iris()
-target_names = iris.target_names
-X, y = iris.data, iris.target
-X, y = X[y != 2], y[y != 2]
-n_samples, n_features = X.shape
-
-# %%
-# We also add noisy features to make the problem harder.
-random_state = np.random.RandomState(0)
-X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)
+# We use :func:`~sklearn.datasets.make_classification` to generate a synthetic
+# dataset with 1,000 samples. The generated dataset has two classes by default.
+# In this case, we set a class separation factor of 0.5, making the classes
+# partially overlapping and not perfectly linearly separable.
+
+from sklearn.datasets import make_classification
+
+X, y = make_classification(
+    n_samples=1_000,
+    n_features=2,
+    n_redundant=0,
+    n_informative=2,
+    class_sep=0.5,
+    random_state=0,
+    n_clusters_per_class=1,
+)
 
 # %%
 # Classification and ROC analysis
 # -------------------------------
 #
-# Here we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and
-# plot the ROC curves fold-wise. Notice that the baseline to define the chance
-# level (dashed ROC curve) is a classifier that would always predict the most
-# frequent class.
+# Here we run a :class:`~sklearn.ensemble.HistGradientBoostingClassifier`
+# with cross-validation and plot the ROC curve of each split. Notice
+# that the baseline to define the chance level (dashed ROC curve) is a
+# classifier that would always predict the most frequent class.
 #
 # In the following plot, quantile coverage is represented in grey, though the
-# AUC value is reported in terms of the mean and standar deviation.
+# AUC value is reported in terms of the mean and standard deviation.
 
 import matplotlib.pyplot as plt
+import numpy as np
 
-from sklearn import svm
+from sklearn.ensemble import HistGradientBoostingClassifier
 from sklearn.metrics import RocCurveDisplay, auc
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import StratifiedShuffleSplit
 
-n_splits = 6
-cv = StratifiedKFold(n_splits=n_splits)
-classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)
+n_splits = 30
+cv = StratifiedShuffleSplit(n_splits=n_splits, random_state=0)
+classifier = HistGradientBoostingClassifier(random_state=42)
 
 tprs = []
 aucs = []
@@ -90,7 +84,7 @@
         classifier,
         X[test],
         y[test],
-        name=f"ROC fold {fold}",
+        label=None,
         alpha=0.3,
         lw=1,
         ax=ax,

From edf72998274d47e4e0fe4c50eb5a4ed819abe79c Mon Sep 17 00:00:00 2001
From: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu>
Date: Mon, 26 Aug 2024 17:09:39 +0200
Subject: [PATCH 08/10] Iter

---
 examples/model_selection/plot_roc_crossval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index 76ae187dd499e..801585989adbf 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -16,7 +16,7 @@
 This example demonstrates how the classifier's ROC response is influenced by
 variations in the training data as obtained through ShuffleSplit cross-validation.
 By analyzing all these curves, we can calculate the mean AUC and visualize the
-variability of the estimated curves across CV folds via a quantile-based
+variability of the estimated curves across CV splits via a quantile-based
 region.
 
 .. note::

From c19e9936422cb0b3ddec64d082ecb96ca2d17b64 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu>
Date: Mon, 26 Aug 2024 17:14:05 +0200
Subject: [PATCH 09/10] Prefer f-string format for legend

---
 examples/model_selection/plot_roc_crossval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index 801585989adbf..4c8c1f1c199bb 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -103,7 +103,7 @@
     mean_fpr,
     mean_tpr,
     color="b",
-    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
+    label=rf"Mean ROC (AUC = {mean_auc:.2f} $\pm$ {std_auc:.2f})",
     lw=2,
     alpha=0.8,
 )

From c0a5518460adaeedef634506b305379c32bcfd45 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu>
Date: Thu, 28 Nov 2024 17:14:37 +0100
Subject: [PATCH 10/10] Set chance level label to None

---
 examples/model_selection/plot_roc_crossval.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index 4c8c1f1c199bb..c4bae123ca84b 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -89,6 +89,7 @@
         lw=1,
         ax=ax,
         plot_chance_level=(fold == n_splits - 1),
+        chance_level_kw={"label": None},
     )
     interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
     interp_tpr[0] = 0.0