Skip to content

Commit 071293d

Browse files
ogrisel and jeremiedbb authored
DOC Various improvements and more details in release highlights for 1.5 (scikit-learn#29056)
Co-authored-by: Jérémie du Boisberranger <jeremie@probabl.ai>
1 parent 0f27a26 commit 071293d

File tree

1 file changed

+87
-40
lines changed

1 file changed

+87
-40
lines changed

examples/release_highlights/plot_release_highlights_1_5_0.py

Lines changed: 87 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -24,89 +24,136 @@
2424
# %%
2525
# FixedThresholdClassifier: Setting the decision threshold of a binary classifier
2626
# -------------------------------------------------------------------------------
27-
# All binary classifiers of scikit-learn use a fixed decision threshold of 0.5 to
28-
# convert probability estimates (i.e. output of `predict_proba`) into class
29-
# predictions. However, 0.5 is almost never the desired threshold for a given problem.
30-
# :class:`~model_selection.FixedThresholdClassifier` allows to wrap any binary
31-
# classifier and set a custom decision threshold.
27+
# All binary classifiers of scikit-learn use a fixed decision threshold of 0.5
28+
# to convert probability estimates (i.e. output of `predict_proba`) into class
29+
# predictions. However, 0.5 is almost never the desired threshold for a given
30+
# problem. :class:`~model_selection.FixedThresholdClassifier` allows wrapping any
31+
# binary classifier and setting a custom decision threshold.
3232
from sklearn.datasets import make_classification
33+
from sklearn.model_selection import train_test_split
3334
from sklearn.linear_model import LogisticRegression
34-
from sklearn.metrics import confusion_matrix
35+
from sklearn.metrics import ConfusionMatrixDisplay
36+
3537

36-
X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
37-
classifier = LogisticRegression(random_state=0).fit(X, y)
38+
X, y = make_classification(n_samples=10_000, weights=[0.9, 0.1], random_state=0)
39+
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
3840

39-
print("confusion matrix:\n", confusion_matrix(y, classifier.predict(X)))
41+
classifier_05 = LogisticRegression(C=1e6, random_state=0).fit(X_train, y_train)
42+
_ = ConfusionMatrixDisplay.from_estimator(classifier_05, X_test, y_test)
4043

4144
# %%
4245
# Lowering the threshold, i.e. allowing more samples to be classified as the positive
4346
# class, increases the number of true positives at the cost of more false positives
4447
# (as is well known from the concavity of the ROC curve).
4548
from sklearn.model_selection import FixedThresholdClassifier
4649

47-
wrapped_classifier = FixedThresholdClassifier(classifier, threshold=0.1).fit(X, y)
48-
49-
print("confusion matrix:\n", confusion_matrix(y, wrapped_classifier.predict(X)))
50+
classifier_01 = FixedThresholdClassifier(classifier_05, threshold=0.1)
51+
classifier_01.fit(X_train, y_train)
52+
_ = ConfusionMatrixDisplay.from_estimator(classifier_01, X_test, y_test)
5053

5154
# %%
5255
# TunedThresholdClassifierCV: Tuning the decision threshold of a binary classifier
5356
# --------------------------------------------------------------------------------
54-
# The decision threshold of a binary classifier can be tuned to optimize a given
55-
# metric, using :class:`~model_selection.TunedThresholdClassifierCV`.
56-
from sklearn.metrics import balanced_accuracy_score
57+
# The decision threshold of a binary classifier can be tuned to optimize a
58+
# given metric, using :class:`~model_selection.TunedThresholdClassifierCV`.
59+
#
60+
# It is particularly useful to find the best decision threshold when the model
61+
# is meant to be deployed in a specific application context where we can assign
62+
# different gains or costs for true positives, true negatives, false positives,
63+
# and false negatives.
64+
#
65+
# Let's illustrate this by considering an arbitrary case where:
66+
#
67+
# - each true positive gains 1 unit of profit, e.g. euro, year of life in good
68+
# health, etc.;
69+
# - true negatives gain or cost nothing;
70+
# - each false negative costs 2;
71+
# - each false positive costs 0.1.
72+
#
73+
# Our metric quantifies the average profit per sample, which is defined by the
74+
# following Python function:
75+
from sklearn.metrics import confusion_matrix
76+
77+
78+
def custom_score(y_observed, y_pred):
79+
tn, fp, fn, tp = confusion_matrix(y_observed, y_pred, normalize="all").ravel()
80+
return tp - 2 * fn - 0.1 * fp
5781

58-
# Due to the class imbalance, the balanced accuracy is not optimal for the default
59-
# threshold. The classifier tends to over predict the majority class.
60-
print(f"balanced accuracy: {balanced_accuracy_score(y, classifier.predict(X)):.2f}")
82+
83+
print("Untuned decision threshold: 0.5")
84+
print(f"Custom score: {custom_score(y_test, classifier_05.predict(X_test)):.2f}")
6185

6286
# %%
63-
# Tuning the threshold to optimize the balanced accuracy gives a smaller threshold
64-
# that allows more samples to be classified as the positive class.
87+
# It is interesting to observe that the average gain per prediction is negative,
88+
# which means that this decision system is making a loss on average.
89+
#
90+
# Tuning the threshold to optimize this custom metric gives a smaller threshold
91+
# that allows more samples to be classified as the positive class. As a result,
92+
# the average gain per prediction improves.
6593
from sklearn.model_selection import TunedThresholdClassifierCV
94+
from sklearn.metrics import make_scorer
6695

96+
custom_scorer = make_scorer(
97+
custom_score, response_method="predict", greater_is_better=True
98+
)
6799
tuned_classifier = TunedThresholdClassifierCV(
68-
classifier, cv=5, scoring="balanced_accuracy"
100+
classifier_05, cv=5, scoring=custom_scorer
69101
).fit(X, y)
70102

71-
print(f"new threshold: {tuned_classifier.best_threshold_:.4f}")
72-
print(
73-
f"balanced accuracy: {balanced_accuracy_score(y, tuned_classifier.predict(X)):.2f}"
74-
)
103+
print(f"Tuned decision threshold: {tuned_classifier.best_threshold_:.3f}")
104+
print(f"Custom score: {custom_score(y_test, tuned_classifier.predict(X_test)):.2f}")
75105

76106
# %%
77-
# :class:`~model_selection.TunedThresholdClassifierCV` also benefits from the
78-
# metadata routing support (:ref:`Metadata Routing User Guide<metadata_routing>`)
79-
# allowing to optimze complex business metrics, detailed
80-
# in :ref:`Post-tuning the decision threshold for cost-sensitive learning
107+
# We observe that tuning the decision threshold can turn a machine
108+
# learning-based system that makes a loss on average into a beneficial one.
109+
#
110+
# In practice, defining a meaningful application-specific metric might involve
111+
# making those costs for bad predictions and gains for good predictions depend on
112+
# auxiliary metadata specific to each individual data point such as the amount
113+
# of a transaction in a fraud detection system.
114+
#
115+
# To achieve this, :class:`~model_selection.TunedThresholdClassifierCV`
116+
# leverages metadata routing support (:ref:`Metadata Routing User
117+
# Guide<metadata_routing>`), which allows optimizing complex business metrics as
118+
# detailed in :ref:`Post-tuning the decision threshold for cost-sensitive
119+
# learning
81120
# <sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py>`.
82121

83122
# %%
84123
# Performance improvements in PCA
85124
# -------------------------------
86-
# :class:`~decomposition.PCA` has a new solver, "covariance_eigh", which is faster
87-
# and more memory efficient than the other solvers for datasets with a large number
88-
# of samples and a small number of features.
125+
# :class:`~decomposition.PCA` has a new solver, `"covariance_eigh"`, which is
126+
# up to an order of magnitude faster and more memory efficient than the other
127+
# solvers for datasets with many data points and few features.
89128
from sklearn.datasets import make_low_rank_matrix
90129
from sklearn.decomposition import PCA
91130

92131
X = make_low_rank_matrix(
93132
n_samples=10_000, n_features=100, tail_strength=0.1, random_state=0
94133
)
95134

96-
pca = PCA(n_components=10).fit(X)
135+
pca = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)
136+
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2f}")
97137

98-
print(f"explained variance: {pca.explained_variance_ratio_.sum():.2f}")
99138

100139
# %%
101-
# The "full" solver has also been improved to use less memory and allows to
102-
# transform faster. The "auto" option for the solver takes advantage of the
103-
# new solver and is now able to select an appropriate solver for sparse
104-
# datasets.
140+
# The new solver also accepts sparse input data:
105141
from scipy.sparse import random
106142

107-
X = random(10000, 100, format="csr", random_state=0)
143+
X = random(10_000, 100, format="csr", random_state=0)
108144

109-
pca = PCA(n_components=10, svd_solver="auto").fit(X)
145+
pca = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)
146+
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2f}")
147+
148+
# %%
149+
# The `"full"` solver has also been improved to use less memory and allows
150+
# faster transformation. The default `svd_solver="auto"` option takes
151+
# advantage of the new solver and is now able to select an appropriate solver
152+
# for sparse datasets.
153+
#
154+
# Similarly to most other PCA solvers, the new `"covariance_eigh"` solver can leverage
155+
# GPU computation if the input data is passed as a PyTorch or CuPy array by
156+
# enabling the experimental support for :ref:`Array API <array_api>`.
110157

111158
# %%
112159
# ColumnTransformer is subscriptable

0 commit comments

Comments
 (0)