DOC Various improvements and more details in release highlights for 1.5 (scikit-learn#29056)

ogrisel · jeremiedbb · web-flow · commit 071293d7ded1 · 2024-05-21T14:51:01.000Z
Co-authored-by: Jérémie du Boisberranger &lt;jeremie@probabl.ai&gt;
diff --git a/examples/release_highlights/plot_release_highlights_1_5_0.py b/examples/release_highlights/plot_release_highlights_1_5_0.py
@@ -24,89 +24,136 @@
 # %%
 # FixedThresholdClassifier: Setting the decision threshold of a binary classifier
 # -------------------------------------------------------------------------------
-# All binary classifiers of scikit-learn use a fixed decision threshold of 0.5 to
-# convert probability estimates (i.e. output of `predict_proba`) into class
-# predictions. However, 0.5 is almost never the desired threshold for a given problem.
-# :class:`~model_selection.FixedThresholdClassifier` allows to wrap any binary
-# classifier and set a custom decision threshold.
+# All binary classifiers of scikit-learn use a fixed decision threshold of 0.5
+# to convert probability estimates (i.e. output of `predict_proba`) into class
+# predictions. However, 0.5 is almost never the desired threshold for a given
+# problem. :class:`~model_selection.FixedThresholdClassifier` allows wrapping any
+# binary classifier and setting a custom decision threshold.
 from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import confusion_matrix
+from sklearn.metrics import ConfusionMatrixDisplay
+
 
-X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
-classifier = LogisticRegression(random_state=0).fit(X, y)
+X, y = make_classification(n_samples=10_000, weights=[0.9, 0.1], random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
 
-print("confusion matrix:\n", confusion_matrix(y, classifier.predict(X)))
+classifier_05 = LogisticRegression(C=1e6, random_state=0).fit(X_train, y_train)
+_ = ConfusionMatrixDisplay.from_estimator(classifier_05, X_test, y_test)
 
 # %%
 # Lowering the threshold, i.e. allowing more samples to be classified as the positive
 # class, increases the number of true positives at the cost of more false positives
 # (as is well known from the concavity of the ROC curve).
 from sklearn.model_selection import FixedThresholdClassifier
 
-wrapped_classifier = FixedThresholdClassifier(classifier, threshold=0.1).fit(X, y)
-
-print("confusion matrix:\n", confusion_matrix(y, wrapped_classifier.predict(X)))
+classifier_01 = FixedThresholdClassifier(classifier_05, threshold=0.1)
+classifier_01.fit(X_train, y_train)
+_ = ConfusionMatrixDisplay.from_estimator(classifier_01, X_test, y_test)
 
 # %%
 # TunedThresholdClassifierCV: Tuning the decision threshold of a binary classifier
 # --------------------------------------------------------------------------------
-# The decision threshold of a binary classifier can be tuned to optimize a given
-# metric, using :class:`~model_selection.TunedThresholdClassifierCV`.
-from sklearn.metrics import balanced_accuracy_score
+# The decision threshold of a binary classifier can be tuned to optimize a
+# given metric, using :class:`~model_selection.TunedThresholdClassifierCV`.
+#
+# It is particularly useful to find the best decision threshold when the model
+# is meant to be deployed in a specific application context where we can assign
+# different gains or costs for true positives, true negatives, false positives,
+# and false negatives.
+#
+# Let's illustrate this by considering an arbitrary case where:
+#
+# - each true positive gains 1 unit of profit, e.g. euro, year of life in good
+#   health, etc.;
+# - true negatives gain or cost nothing;
+# - each false negative costs 2;
+# - each false positive costs 0.1.
+#
+# Our metric quantifies the average profit per sample, which is defined by the
+# following Python function:
+from sklearn.metrics import confusion_matrix
+
+
+def custom_score(y_observed, y_pred):
+    tn, fp, fn, tp = confusion_matrix(y_observed, y_pred, normalize="all").ravel()
+    return tp - 2 * fn - 0.1 * fp
 
-# Due to the class imbalance, the balanced accuracy is not optimal for the default
-# threshold. The classifier tends to over predict the majority class.
-print(f"balanced accuracy: {balanced_accuracy_score(y, classifier.predict(X)):.2f}")
+
+print("Untuned decision threshold: 0.5")
+print(f"Custom score: {custom_score(y_test, classifier_05.predict(X_test)):.2f}")
 
 # %%
-# Tuning the threshold to optimize the balanced accuracy gives a smaller threshold
-# that allows more samples to be classified as the positive class.
+# It is interesting to observe that the average gain per prediction is negative,
+# which means that this decision system is making a loss on average.
+#
+# Tuning the threshold to optimize this custom metric gives a smaller threshold
+# that allows more samples to be classified as the positive class. As a result,
+# the average gain per prediction improves.
 from sklearn.model_selection import TunedThresholdClassifierCV
+from sklearn.metrics import make_scorer
 
+custom_scorer = make_scorer(
+    custom_score, response_method="predict", greater_is_better=True
+)
 tuned_classifier = TunedThresholdClassifierCV(
-    classifier, cv=5, scoring="balanced_accuracy"
+    classifier_05, cv=5, scoring=custom_scorer
 ).fit(X, y)
 
-print(f"new threshold: {tuned_classifier.best_threshold_:.4f}")
-print(
-    f"balanced accuracy: {balanced_accuracy_score(y, tuned_classifier.predict(X)):.2f}"
-)
+print(f"Tuned decision threshold: {tuned_classifier.best_threshold_:.3f}")
+print(f"Custom score: {custom_score(y_test, tuned_classifier.predict(X_test)):.2f}")
 
 # %%
-# :class:`~model_selection.TunedThresholdClassifierCV` also benefits from the
-# metadata routing support (:ref:`Metadata Routing User Guide<metadata_routing>`)
-# allowing to optimze complex business metrics, detailed
-# in :ref:`Post-tuning the decision threshold for cost-sensitive learning
+# We observe that tuning the decision threshold can turn a machine
+# learning-based system that makes a loss on average into a beneficial one.
+#
+# In practice, defining a meaningful application-specific metric might involve
+# making those costs for bad predictions and gains for good predictions depend on
+# auxiliary metadata specific to each individual data point such as the amount
+# of a transaction in a fraud detection system.
+#
+# To achieve this, :class:`~model_selection.TunedThresholdClassifierCV`
+# leverages metadata routing support (:ref:`Metadata Routing User
+# Guide<metadata_routing>`), which allows optimizing complex business metrics as
+# detailed in :ref:`Post-tuning the decision threshold for cost-sensitive
+# learning
 # <sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py>`.
 
 # %%
 # Performance improvements in PCA
 # -------------------------------
-# :class:`~decomposition.PCA` has a new solver, "covariance_eigh", which is faster
-# and more memory efficient than the other solvers for datasets with a large number
-# of samples and a small number of features.
+# :class:`~decomposition.PCA` has a new solver, `"covariance_eigh"`, which is
+# up to an order of magnitude faster and more memory efficient than the other
+# solvers for datasets with many data points and few features.
 from sklearn.datasets import make_low_rank_matrix
 from sklearn.decomposition import PCA
 
 X = make_low_rank_matrix(
     n_samples=10_000, n_features=100, tail_strength=0.1, random_state=0
 )
 
-pca = PCA(n_components=10).fit(X)
+pca = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)
+print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2f}")
 
-print(f"explained variance: {pca.explained_variance_ratio_.sum():.2f}")
 
 # %%
-# The "full" solver has also been improved to use less memory and allows to
-# transform faster. The "auto" option for the solver takes advantage of the
-# new solver and is now able to select an appropriate solver for sparse
-# datasets.
+# The new solver also accepts sparse input data:
 from scipy.sparse import random
 
-X = random(10000, 100, format="csr", random_state=0)
+X = random(10_000, 100, format="csr", random_state=0)
 
-pca = PCA(n_components=10, svd_solver="auto").fit(X)
+pca = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)
+print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2f}")
+
+# %%
+# The `"full"` solver has also been improved to use less memory and allows
+# faster transformation. The default `svd_solver="auto"` option takes
+# advantage of the new solver and is now able to select an appropriate solver
+# for sparse datasets.
+#
+# Similarly to most other PCA solvers, the new `"covariance_eigh"` solver can leverage
+# GPU computation if the input data is passed as a PyTorch or CuPy array by
+# enabling the experimental support for :ref:`Array API <array_api>`.
 
 # %%
 # ColumnTransformer is subscriptable