From 5f7d6ef6268bc0f3b84917ab4d80581672d29b0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Mon, 13 May 2024 14:07:06 +0200
Subject: [PATCH 01/10] init

---
 .../plot_release_highlights_1_5_0.py          | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 examples/release_highlights/plot_release_highlights_1_5_0.py

diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py
new file mode 100644
index 0000000000000..e89cc49d5f096
--- /dev/null
+++ b/examples/release_highlights/plot_release_highlights_1_5_0.py
@@ -0,0 +1,30 @@
+# ruff: noqa
+"""
+=======================================
+Release Highlights for scikit-learn 1.5
+=======================================
+
+.. currentmodule:: sklearn
+
+We are pleased to announce the release of scikit-learn 1.5! Many bug fixes
+and improvements were added, as well as some new key features. We detail
+below a few of the major features of this release. **For an exhaustive list of
+all the changes**, please refer to the :ref:`release notes `.
+
+To install the latest version (with pip)::
+
+    pip install --upgrade scikit-learn
+
+or with conda::
+
+    conda install -c conda-forge scikit-learn
+
+"""
+
+# %%
+# Placeholder
+# -----------
+
+# %%
+# Placeholder
+# -----------

From 9f10c23a7fa2a310fb9cb98435220aeba37a9fda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Fri, 17 May 2024 16:21:53 +0200
Subject: [PATCH 02/10] add highlights

---
 .../plot_release_highlights_1_5_0.py          | 152 +++++++++++++++++-
 1 file changed, 148 insertions(+), 4 deletions(-)

diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py
index e89cc49d5f096..3d6d3f5674f95 100644
--- a/examples/release_highlights/plot_release_highlights_1_5_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_5_0.py
@@ -22,9 +22,153 @@
 """
 
 # %%
-# Placeholder
-# -----------
+# FixedThresholdClassifier: Setting the decision threshold of a binary classifier
+# -------------------------------------------------------------------------------
+# All binary classifiers of scikit-learn use a fixed decision threshold of 0.5 to
+# convert probability estimates (i.e. outputs of `predict_proba`) into class
+# predictions. However it is often not the desired threshold for a given problem.
+# :class:`model_selection.FixedThresholdClassifier` allows wrapping any binary
+# classifier and setting a custom decision threshold.
+from sklearn.datasets import make_classification
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import confusion_matrix
+
+X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
+classifier = LogisticRegression(random_state=0).fit(X, y)
+
+print("confusion matrix:\n", confusion_matrix(y, classifier.predict(X)))
+
+# %%
+# Moving the threshold towards the negative class, i.e. allowing more samples to be
+# classified as the positive class, increases the number of true positives at the
+# cost of more false positives.
+from sklearn.model_selection import FixedThresholdClassifier
+
+wrapped_classifier = FixedThresholdClassifier(classifier, threshold=0.1).fit(X, y)
+
+print("confusion matrix:\n", confusion_matrix(y, wrapped_classifier.predict(X)))
+
+# %%
+# TunedThresholdClassifierCV: Tuning the decision threshold of a binary classifier
+# --------------------------------------------------------------------------------
+# The decision threshold of a binary classifier can be tuned to optimize a given
+# metric, using :class:`model_selection.TunedThresholdClassifierCV`.
+from sklearn.metrics import balanced_accuracy_score
+
+# Due to the class imbalance, the balanced accuracy is not optimal for the default
+# threshold. The classifier tends to over-predict the majority class.
+print("balanced accuracy:", balanced_accuracy_score(y, classifier.predict(X)))
+
+# %%
+# Tuning the threshold to optimize the balanced accuracy gives a smaller threshold
+# that allows more samples to be classified as the positive class.
+from sklearn.model_selection import TunedThresholdClassifierCV
+
+tuned_classifier = TunedThresholdClassifierCV(
+    classifier, cv=5, scoring="balanced_accuracy"
+).fit(X, y)
+
+print("new threshold:", tuned_classifier.best_threshold_)
+print("balanced accuracy:", balanced_accuracy_score(y, tuned_classifier.predict(X)))
+
+# %%
+# :class:`model_selection.TunedThresholdClassifierCV` also benefits from the
+# metadata routing support allowing to optimize complex business metrics, detailed
+# in :ref:`Post-tuning the decision threshold for cost-sensitive learning`
+# `.
+
+# %%
+# Performance improvements in PCA
+# -------------------------------
+# :class:`decomposition.PCA` has a new solver, "covariance_eigh", which is faster
+# and more memory efficient than the other solvers for datasets with a large number
+# of samples and a small number of features.
+from sklearn.datasets import make_low_rank_matrix
+from sklearn.decomposition import PCA
+
+X = make_low_rank_matrix(
+    n_samples=10_000, n_features=100, tail_strength=0.1, random_state=0
+)
+
+pca = PCA(n_components=10).fit(X)
+
+print("explained variance:", pca.explained_variance_ratio_.sum())
+
+# %%
+# The "full" solver has also been improved to use less memory and to
+# transform faster. The "auto" option for the solver takes advantage of the
+# new solver and is now able to select an appropriate solver for sparse
+# datasets.
+from scipy.sparse import random
+
+X = random(10000, 100, format="csr", random_state=0)
+
+pca = PCA(n_components=10, svd_solver="auto").fit(X)
 
 # %%
-# Placeholder
-# -----------
+# :class:`compose.ColumnTransformer` is now subscriptable, allowing to access
+# transformers by name.
+import numpy as np
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+
+X = np.array([[0, 1, 2], [3, 4, 5]])
+column_transformer = ColumnTransformer(
+    [("std_scaler", StandardScaler(), [0]), ("one_hot", OneHotEncoder(), [1, 2])]
+)
+
+column_transformer.fit(X)
+
+print(column_transformer["std_scaler"])
+print(column_transformer["one_hot"])
+
+# %%
+# :class:`impute.SimpleImputer` now supports custom strategies for imputation,
+# using a callable that computes a scalar value from the non-missing values of
+# a column vector.
+from sklearn.impute import SimpleImputer
+
+X = np.array(
+    [
+        [-1.1, 1.1, 1.1],
+        [3.9, -1.2, np.nan],
+        [np.nan, 1.3, np.nan],
+        [-0.1, -1.4, -1.4],
+        [-4.9, 1.5, -1.5],
+        [np.nan, 1.6, 1.6],
+    ]
+)
+
+
+def smallest_abs(arr):
+    """Return the smallest absolute value of a 1D array."""
+    return np.min(np.abs(arr))
+
+
+imputer = SimpleImputer(strategy=smallest_abs)
+
+imputer.fit_transform(X)
+
+# %%
+# :func:`metrics.pairwise_distances` can now compute distances between
+# non-numeric arrays using a callable metric.
+from sklearn.metrics import pairwise_distances
+
+X = np.array(["cat", "dog"])
+Y = np.array(["cat", "fox"])
+
+
+def levenshtein_distance(x, y):
+    """Return the Levenshtein distance between two strings."""
+    if x == "" or y == "":
+        return max(len(x), len(y))
+    if x[0] == y[0]:
+        return levenshtein_distance(x[1:], y[1:])
+    return 1 + min(
+        levenshtein_distance(x[1:], y),
+        levenshtein_distance(x, y[1:]),
+        levenshtein_distance(x[1:], y[1:]),
+    )
+
+
+pairwise_distances(X, Y, metric=levenshtein_distance)

From f11be22bf248c0c5d35d212a8a8a88ffad7795ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Fri, 17 May 2024 16:51:35 +0200
Subject: [PATCH 03/10] iter

---
 .../plot_release_highlights_1_5_0.py          | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py
index 3d6d3f5674f95..4dd3274ceffaa 100644
--- a/examples/release_highlights/plot_release_highlights_1_5_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_5_0.py
@@ -74,7 +74,7 @@
 # %%
 # :class:`model_selection.TunedThresholdClassifierCV` also benefits from the
 # metadata routing support allowing to optimize complex business metrics, detailed
-# in :ref:`Post-tuning the decision threshold for cost-sensitive learning`
+# in :ref:`Post-tuning the decision threshold for cost-sensitive learning
 # `.
 
 # %%
@@ -106,8 +106,10 @@
 pca = PCA(n_components=10, svd_solver="auto").fit(X)
 
 # %%
-# :class:`compose.ColumnTransformer` is now subscriptable, allowing to access
-# transformers by name.
+# ColumnTransformer is subscriptable
+# ----------------------------------
+# The transformers of a :class:`compose.ColumnTransformer` can now be directly
+# accessed using indexing by name.
 import numpy as np
 from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
@@ -123,6 +125,8 @@ print(column_transformer["std_scaler"])
 print(column_transformer["one_hot"])
 
 # %%
+# Custom imputation strategies for the SimpleImputer
+# --------------------------------------------------
 # :class:`impute.SimpleImputer` now supports custom strategies for imputation,
 # using a callable that computes a scalar value from the non-missing values of
 # a column vector.
@@ -150,6 +154,8 @@
 imputer.fit_transform(X)
 
 # %%
+# Pairwise distances with non-numeric arrays
+# ------------------------------------------
 # :func:`metrics.pairwise_distances` can now compute distances between
 # non-numeric arrays using a callable metric.
 from sklearn.metrics import pairwise_distances

From 42f2b7a67eccb4d7964ab0179f383e35104dad6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Fri, 17 May 2024 17:09:10 +0200
Subject: [PATCH 04/10] Update
 examples/release_highlights/plot_release_highlights_1_5_0.py

Co-authored-by: Tim Head
---
 examples/release_highlights/plot_release_highlights_1_5_0.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py
index 4dd3274ceffaa..dca9eef59ecf9 100644
--- a/examples/release_highlights/plot_release_highlights_1_5_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_5_0.py
@@ -7,7 +7,7 @@
 .. currentmodule:: sklearn
 
 We are pleased to announce the release of scikit-learn 1.5! Many bug fixes
-and improvements were added, as well as some new key features. We detail
+and improvements were added, as well as some key new features. We detail
 below a few of the major features of this release. **For an exhaustive list of
 all the changes**, please refer to the :ref:`release notes `.
 

From 524204bcbb6fd017968a7d86d7439c481bfe1105 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Fri, 17 May 2024 17:10:12 +0200
Subject: [PATCH 05/10] Update
 examples/release_highlights/plot_release_highlights_1_5_0.py

---
 examples/release_highlights/plot_release_highlights_1_5_0.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py
index dca9eef59ecf9..bcc130472f92d 100644
--- a/examples/release_highlights/plot_release_highlights_1_5_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_5_0.py
@@ -7,8 +7,8 @@
 .. currentmodule:: sklearn
 
 We are pleased to announce the release of scikit-learn 1.5! Many bug fixes
-and improvements were added, as well as some key new features. We detail
-below a few of the major features of this release. **For an exhaustive list of
+and improvements were added, as well as some key new features. Below we
+detail a few of the major features of this release. **For an exhaustive list of
 all the changes**, please refer to the :ref:`release notes `.
 

From e516746b4f0d5c064c18d84721b91e3af4780df9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Fri, 17 May 2024 17:43:26 +0200
Subject: [PATCH 06/10] Apply suggestions from code review

Co-authored-by: Guillaume Lemaitre
---
 .../plot_release_highlights_1_5_0.py          | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py
index bcc130472f92d..b948cac246266 100644
--- a/examples/release_highlights/plot_release_highlights_1_5_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_5_0.py
@@ -27,7 +27,7 @@
 # All binary classifiers of scikit-learn use a fixed decision threshold of 0.5 to
 # convert probability estimates (i.e. outputs of `predict_proba`) into class
 # predictions. However it is often not the desired threshold for a given problem.
-# :class:`model_selection.FixedThresholdClassifier` allows wrapping any binary
+# :class:`~model_selection.FixedThresholdClassifier` allows wrapping any binary
 # classifier and setting a custom decision threshold.
 from sklearn.datasets import make_classification
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import confusion_matrix
@@ -52,7 +52,7 @@
 # TunedThresholdClassifierCV: Tuning the decision threshold of a binary classifier
 # --------------------------------------------------------------------------------
 # The decision threshold of a binary classifier can be tuned to optimize a given
-# metric, using :class:`model_selection.TunedThresholdClassifierCV`.
+# metric, using :class:`~model_selection.TunedThresholdClassifierCV`.
 from sklearn.metrics import balanced_accuracy_score
 
 # Due to the class imbalance, the balanced accuracy is not optimal for the default
@@ -72,7 +72,7 @@
 print("balanced accuracy:", balanced_accuracy_score(y, tuned_classifier.predict(X)))
 
 # %%
-# :class:`model_selection.TunedThresholdClassifierCV` also benefits from the
+# :class:`~model_selection.TunedThresholdClassifierCV` also benefits from the
 # metadata routing support allowing to optimize complex business metrics, detailed
 # in :ref:`Post-tuning the decision threshold for cost-sensitive learning
 # `.
@@ -80,7 +80,7 @@
 # %%
 # Performance improvements in PCA
 # -------------------------------
-# :class:`decomposition.PCA` has a new solver, "covariance_eigh", which is faster
+# :class:`~decomposition.PCA` has a new solver, "covariance_eigh", which is faster
 # and more memory efficient than the other solvers for datasets with a large number
 # of samples and a small number of features.
@@ -108,7 +108,7 @@
 # %%
 # ColumnTransformer is subscriptable
 # ----------------------------------
-# The transformers of a :class:`compose.ColumnTransformer` can now be directly
+# The transformers of a :class:`~compose.ColumnTransformer` can now be directly
 # accessed using indexing by name.
 import numpy as np
@@ -127,7 +127,7 @@
 # %%
 # Custom imputation strategies for the SimpleImputer
 # --------------------------------------------------
-# :class:`impute.SimpleImputer` now supports custom strategies for imputation,
+# :class:`~impute.SimpleImputer` now supports custom strategies for imputation,
 # using a callable that computes a scalar value from the non-missing values of
 # a column vector.
@@ -156,7 +156,7 @@
 # %%
 # Pairwise distances with non-numeric arrays
 # ------------------------------------------
-# :func:`metrics.pairwise_distances` can now compute distances between
+# :func:`~metrics.pairwise_distances` can now compute distances between
 # non-numeric arrays using a callable metric.
 from sklearn.metrics import pairwise_distances

From 444218be929c813da1551b370218edc4fcbaf95b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Fri, 17 May 2024 17:51:16 +0200
Subject: [PATCH 07/10] Apply suggestions from code review

Co-authored-by: Guillaume Lemaitre
---
 .../plot_release_highlights_1_5_0.py          | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py
index b948cac246266..3de528467f83d 100644
--- a/examples/release_highlights/plot_release_highlights_1_5_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_5_0.py
@@ -57,7 +57,7 @@
 
 # Due to the class imbalance, the balanced accuracy is not optimal for the default
 # threshold. The classifier tends to over-predict the majority class.
-print("balanced accuracy:", balanced_accuracy_score(y, classifier.predict(X))) +print(f"balanced accuracy: {balanced_accuracy_score(y, classifier.predict(X)):.2f}") # %% # Tuning the threshold to optimize the balanced accuracy gives a smaller threshold @@ -68,8 +68,10 @@ classifier, cv=5, scoring="balanced_accuracy" ).fit(X, y) -print("new threshold:", tuned_classifier.best_threshold_) -print("balanced accuracy:", balanced_accuracy_score(y, tuned_classifier.predict(X))) +print("new threshold: {tuned_classifier.best_threshold_:.4f}") +print( + "balanced accuracy: {balanced_accuracy_score(y, tuned_classifier.predict(X)):.2f}" +) # %% # :class:`~model_selection.TunedThresholdClassifierCV` also benefits from the @@ -92,7 +94,7 @@ pca = PCA(n_components=10).fit(X) -print("explained variance:", pca.explained_variance_ratio_.sum()) +print(f"explained variance: {pca.explained_variance_ratio_.sum():.2f}") # %% # The "full" solver has also been improved to use less memory and allows to From 19f17bc65bf7b3f1ad18719ebc1a38078927ec06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Fri, 17 May 2024 18:10:33 +0200 Subject: [PATCH 08/10] Update examples/release_highlights/plot_release_highlights_1_5_0.py --- examples/release_highlights/plot_release_highlights_1_5_0.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py index 3de528467f83d..3448dc90ebd03 100644 --- a/examples/release_highlights/plot_release_highlights_1_5_0.py +++ b/examples/release_highlights/plot_release_highlights_1_5_0.py @@ -162,8 +162,8 @@ def smallest_abs(arr): # non-numeric arrays using a callable metric. from sklearn.metrics import pairwise_distances -X = np.array(["cat", "dog"]) -Y = np.array(["cat", "fox"]) +X = ["cat", "dog"] +Y = ["cat", "fox"] def levenshtein_distance(x, y): From d8cb520dc4344f02a3eed1eda24cf0ddcc399ee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Mon, 20 May 2024 10:29:30 +0200 Subject: [PATCH 09/10] Update examples/release_highlights/plot_release_highlights_1_5_0.py Co-authored-by: Christian Lorentzen --- examples/release_highlights/plot_release_highlights_1_5_0.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py index 3448dc90ebd03..a4aed7c5db61f 100644 --- a/examples/release_highlights/plot_release_highlights_1_5_0.py +++ b/examples/release_highlights/plot_release_highlights_1_5_0.py @@ -25,7 +25,7 @@ # FixedThresholdClassifier: Setting the decision threshold of a binary classifier # ------------------------------------------------------------------------------- # All binary classifiers of scikit-learn use a fixed decision threshold of 0.5 to -# convert probability estimates (i.e. outputs of `predict_proba`) into class +# convert probability estimates (i.e. output of `predict_proba`) into class # predictions. However it is often not the desired threshold for a given problem. # :class:`~model_selection.FixedThresholdClassifier` allows to wrap any binary # classifier and set a custom decision threshold. 
From 5bdc8533c0a9e563b0003c41cbd85f24316b12ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Mon, 20 May 2024 10:41:11 +0200
Subject: [PATCH 10/10] address comments

---
 .../plot_release_highlights_1_5_0.py          | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py
index a4aed7c5db61f..0acc6fda6589d 100644
--- a/examples/release_highlights/plot_release_highlights_1_5_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_5_0.py
@@ -8,7 +8,7 @@
 
 We are pleased to announce the release of scikit-learn 1.5! Many bug fixes
 and improvements were added, as well as some key new features. Below we
-detail a few of the major features of this release. **For an exhaustive list of
+detail the highlights of this release. **For an exhaustive list of
 all the changes**, please refer to the :ref:`release notes `.
 
 To install the latest version (with pip)::
@@ -26,7 +26,7 @@
 # -------------------------------------------------------------------------------
 # All binary classifiers of scikit-learn use a fixed decision threshold of 0.5 to
 # convert probability estimates (i.e. output of `predict_proba`) into class
-# predictions. However it is often not the desired threshold for a given problem.
+# predictions. However, 0.5 is almost never the desired threshold for a given problem.
 # :class:`~model_selection.FixedThresholdClassifier` allows wrapping any binary
 # classifier and setting a custom decision threshold.
@@ -39,9 +39,9 @@
 print("confusion matrix:\n", confusion_matrix(y, classifier.predict(X)))
 
 # %%
-# Moving the threshold towards the negative class, i.e. allowing more samples to be
-# classified as the positive class, increases the number of true positives at the
-# cost of more false positives.
+# Lowering the threshold, i.e. allowing more samples to be classified as the positive
+# class, increases the number of true positives at the cost of more false positives
+# (as is well known from the concavity of the ROC curve).
 from sklearn.model_selection import FixedThresholdClassifier
 
 wrapped_classifier = FixedThresholdClassifier(classifier, threshold=0.1).fit(X, y)
@@ -68,14 +68,15 @@
     classifier, cv=5, scoring="balanced_accuracy"
 ).fit(X, y)
 
-print("new threshold: {tuned_classifier.best_threshold_:.4f}")
+print(f"new threshold: {tuned_classifier.best_threshold_:.4f}")
 print(
-    "balanced accuracy: {balanced_accuracy_score(y, tuned_classifier.predict(X)):.2f}"
+    f"balanced accuracy: {balanced_accuracy_score(y, tuned_classifier.predict(X)):.2f}"
 )
 
 # %%
 # :class:`~model_selection.TunedThresholdClassifierCV` also benefits from the
-# metadata routing support allowing to optimize complex business metrics, detailed
+# metadata routing support (:ref:`Metadata Routing User Guide`)
+# allowing to optimize complex business metrics, detailed
 # in :ref:`Post-tuning the decision threshold for cost-sensitive learning
 # `.
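
Note (not part of the patch series above): the last hunk only links to the cost-sensitive learning example for tuning against a business metric. A minimal sketch of what that looks like with the TunedThresholdClassifierCV API shown in these patches; the gain values below are made up for illustration:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import TunedThresholdClassifierCV

X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)


def business_gain(y_true, y_pred):
    # Hypothetical payoff: a true positive earns 40, a false positive costs 10.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return 40 * tp - 10 * fp


# The threshold is chosen to maximize the custom scorer instead of a built-in metric.
tuned = TunedThresholdClassifierCV(
    LogisticRegression(), scoring=make_scorer(business_gain)
).fit(X, y)

print(f"threshold maximizing the business gain: {tuned.best_threshold_:.3f}")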