From d6e864a2250c5f6f264188fd3ebba86c0ac3e161 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 2 Aug 2024 09:46:49 +0200 Subject: [PATCH 01/10] DOC Use quantiles instead of std in ROC example with cross-validation --- examples/model_selection/plot_roc_crossval.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index fb6432a71ed79..e16b2aa25941c 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -66,6 +66,9 @@ # plot the ROC curves fold-wise. Notice that the baseline to define the chance # level (dashed ROC curve) is a classifier that would always predict the most # frequent class. +# +# In the following plot, quantile coverage is represented by shades of grey, +# with darker colors indicating values closer to the median. import matplotlib.pyplot as plt @@ -112,17 +115,19 @@ alpha=0.8, ) -std_tpr = np.std(tprs, axis=0) -tprs_upper = np.minimum(mean_tpr + std_tpr, 1) -tprs_lower = np.maximum(mean_tpr - std_tpr, 0) -ax.fill_between( - mean_fpr, - tprs_lower, - tprs_upper, - color="grey", - alpha=0.2, - label=r"$\pm$ 1 std. 
dev.", -) +quantile_offsets = np.linspace(0.05, 0.45, 5) +for offset in quantile_offsets: + upper_quantile = np.quantile(tprs, 0.5 + offset, axis=0) + lower_quantile = np.quantile(tprs, 0.5 - offset, axis=0) + label = f"total quantile coverage = {2*offset:.0%}" if offset == 0.45 else None + ax.fill_between( + mean_fpr, + lower_quantile, + upper_quantile, + color="grey", + alpha=0.4, + label=label, + ) ax.set( xlabel="False Positive Rate", From e37a08937373d3b8a95a8ff500d1b7f285db35e2 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 2 Aug 2024 09:47:59 +0200 Subject: [PATCH 02/10] Improve wording --- examples/model_selection/plot_roc_crossval.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index e16b2aa25941c..d4dd9f182aeca 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -13,12 +13,10 @@ better. The "steepness" of ROC curves is also important, since it is ideal to maximize the TPR while minimizing the FPR. -This example shows the ROC response of different datasets, created from K-fold -cross-validation. Taking all of these curves, it is possible to calculate the -mean AUC, and see the variance of the curve when the -training set is split into different subsets. This roughly shows how the -classifier output is affected by changes in the training data, and how different -the splits generated by K-fold cross-validation are from one another. +This example demonstrates how the classifier's ROC response is influenced by +variations in the training data as obtained through K-fold cross-validation. +By analyzing all these curves, we can calculate the mean AUC and visualize the +variance among them. .. 
note:: From 9b1d924a59691a6108f25bbab2eb0fae2f10bf38 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 2 Aug 2024 13:48:26 +0200 Subject: [PATCH 03/10] Apply suggestions from code review Co-authored-by: Olivier Grisel --- examples/model_selection/plot_roc_crossval.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index d4dd9f182aeca..f3fb6cbd62315 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -16,7 +16,8 @@ This example demonstrates how the classifier's ROC response is influenced by variations in the training data as obtained through K-fold cross-validation. By analyzing all these curves, we can calculate the mean AUC and visualize the -variance among them. +variability of the estimated curves across CV folds via a quantile-based +region. .. note:: @@ -117,7 +118,7 @@ for offset in quantile_offsets: upper_quantile = np.quantile(tprs, 0.5 + offset, axis=0) lower_quantile = np.quantile(tprs, 0.5 - offset, axis=0) - label = f"total quantile coverage = {2*offset:.0%}" if offset == 0.45 else None + label = f"5% to 95% percentile region" if offset == 0.45 else None ax.fill_between( mean_fpr, lower_quantile, From 00ada90596815aae8ca4e3be219cf2b091f0c2d7 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 8 Aug 2024 11:08:02 +0200 Subject: [PATCH 04/10] Fix linter --- examples/model_selection/plot_roc_crossval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index f3fb6cbd62315..44c91525f47c3 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -118,7 +118,7 @@ for offset in quantile_offsets: upper_quantile = np.quantile(tprs, 0.5 + offset, axis=0) lower_quantile = 
np.quantile(tprs, 0.5 - offset, axis=0) - label = f"5% to 95% percentile region" if offset == 0.45 else None + label = "5% to 95% percentile region" if offset == 0.45 else None ax.fill_between( mean_fpr, lower_quantile, From d977d56cb14f4afed3c7441bcec3a5b33a142793 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 26 Aug 2024 16:58:26 +0200 Subject: [PATCH 05/10] Plot a single 90 percentile region as per Olivier's suggestion --- examples/model_selection/plot_roc_crossval.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index 44c91525f47c3..d7c5eb9314961 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -114,24 +114,22 @@ alpha=0.8, ) -quantile_offsets = np.linspace(0.05, 0.45, 5) -for offset in quantile_offsets: - upper_quantile = np.quantile(tprs, 0.5 + offset, axis=0) - lower_quantile = np.quantile(tprs, 0.5 - offset, axis=0) - label = "5% to 95% percentile region" if offset == 0.45 else None - ax.fill_between( - mean_fpr, - lower_quantile, - upper_quantile, - color="grey", - alpha=0.4, - label=label, - ) + +upper_quantile = np.quantile(tprs, 0.95, axis=0) +lower_quantile = np.quantile(tprs, 0.05, axis=0) +ax.fill_between( + mean_fpr, + lower_quantile, + upper_quantile, + color="grey", + alpha=0.4, + label="5% to 95% percentile region", +) ax.set( xlabel="False Positive Rate", ylabel="True Positive Rate", - title=f"Mean ROC curve with variability\n(Positive label '{target_names[1]}')", + title="Mean ROC curve with variability", ) ax.legend(loc="lower right") plt.show() From 8bfebd21b7380de4ef02774a44fb4becb14714ab Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 26 Aug 2024 17:00:39 +0200 Subject: [PATCH 06/10] Iter --- examples/model_selection/plot_roc_crossval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index d7c5eb9314961..56def63f9d63a 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -66,8 +66,8 @@ # level (dashed ROC curve) is a classifier that would always predict the most # frequent class. # -# In the following plot, quantile coverage is represented by shades of grey, -# with darker colors indicating values closer to the median. +# In the following plot, quantile coverage is represented in grey, though the +# AUC value is reported in terms of the mean and standard deviation. import matplotlib.pyplot as plt From beda9e9fb82c00a605aad8333c346688b4be8090 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 26 Aug 2024 17:07:53 +0200 Subject: [PATCH 07/10] Use ShuffleSplit, hgbt and make_classification --- examples/model_selection/plot_roc_crossval.py | 62 +++++++++---------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index 56def63f9d63a..76ae187dd499e 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -14,7 +14,7 @@ maximize the TPR while minimizing the FPR. This example demonstrates how the classifier's ROC response is influenced by -variations in the training data as obtained through K-fold cross-validation. +variations in the training data as obtained through ShuffleSplit cross-validation. By analyzing all these curves, we can calculate the mean AUC and visualize the variability of the estimated curves across CV folds via a quantile-based region. @@ -33,51 +33,45 @@ # Load and prepare data # ===================== # -# We import the :ref:`iris_dataset` which contains 3 classes, each one -# corresponding to a type of iris plant.
One class is linearly separable from -# the other 2; the latter are **not** linearly separable from each other. -# -# In the following we binarize the dataset by dropping the "virginica" class -# (`class_id=2`). This means that the "versicolor" class (`class_id=1`) is -# regarded as the positive class and "setosa" as the negative class -# (`class_id=0`). - -import numpy as np - -from sklearn.datasets import load_iris - -iris = load_iris() -target_names = iris.target_names -X, y = iris.data, iris.target -X, y = X[y != 2], y[y != 2] -n_samples, n_features = X.shape - -# %% -# We also add noisy features to make the problem harder. -random_state = np.random.RandomState(0) -X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1) +# We use :func:`~sklearn.datasets.make_classification` to generate a synthetic +# dataset with 1,000 samples. The generated dataset has two classes by default. +# In this case, we set a class separation factor of 0.5, making the classes +# partially overlapping and not perfectly linearly separable. + +from sklearn.datasets import make_classification + +X, y = make_classification( + n_samples=1_000, + n_features=2, + n_redundant=0, + n_informative=2, + class_sep=0.5, + random_state=0, + n_clusters_per_class=1, +) # %% # Classification and ROC analysis # ------------------------------- # -# Here we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and -# plot the ROC curves fold-wise. Notice that the baseline to define the chance -# level (dashed ROC curve) is a classifier that would always predict the most -# frequent class. +# Here we run a :class:`~sklearn.ensemble.HistGradientBoostingClassifier` +# classifier with cross-validation and plot the ROC curves fold-wise. Notice +# that the baseline to define the chance level (dashed ROC curve) is a +# classifier that would always predict the most frequent class.
# # In the following plot, quantile coverage is represented in grey, though the # AUC value is reported in terms of the mean and standard deviation. import matplotlib.pyplot as plt +import numpy as np -from sklearn import svm +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.metrics import RocCurveDisplay, auc -from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import StratifiedShuffleSplit -n_splits = 6 -cv = StratifiedKFold(n_splits=n_splits) -classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state) +n_splits = 30 +cv = StratifiedShuffleSplit(n_splits=n_splits, random_state=0) +classifier = HistGradientBoostingClassifier(random_state=42) tprs = [] aucs = [] @@ -90,7 +84,7 @@ classifier, X[test], y[test], - name=f"ROC fold {fold}", + label=None, alpha=0.3, lw=1, ax=ax, From edf72998274d47e4e0fe4c50eb5a4ed819abe79c Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 26 Aug 2024 17:09:39 +0200 Subject: [PATCH 08/10] Iter --- examples/model_selection/plot_roc_crossval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index 76ae187dd499e..801585989adbf 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -16,7 +16,7 @@ This example demonstrates how the classifier's ROC response is influenced by variations in the training data as obtained through ShuffleSplit cross-validation. By analyzing all these curves, we can calculate the mean AUC and visualize the -variability of the estimated curves across CV folds via a quantile-based +variability of the estimated curves across CV splits via a quantile-based region. ..
note:: From c19e9936422cb0b3ddec64d082ecb96ca2d17b64 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 26 Aug 2024 17:14:05 +0200 Subject: [PATCH 09/10] Prefer f-string format for legend --- examples/model_selection/plot_roc_crossval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index 801585989adbf..4c8c1f1c199bb 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -103,7 +103,7 @@ mean_fpr, mean_tpr, color="b", - label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc), + label=rf"Mean ROC (AUC = {mean_auc:.2f} $\pm$ {std_auc:.2f})", lw=2, alpha=0.8, ) From c0a5518460adaeedef634506b305379c32bcfd45 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 28 Nov 2024 17:14:37 +0100 Subject: [PATCH 10/10] Set chance level label to None --- examples/model_selection/plot_roc_crossval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index 4c8c1f1c199bb..c4bae123ca84b 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -89,6 +89,7 @@ lw=1, ax=ax, plot_chance_level=(fold == n_splits - 1), + chance_level_kw={"label": None}, ) interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr) interp_tpr[0] = 0.0