# ruff: noqa
"""
=======================================
Release Highlights for scikit-learn 1.5
=======================================

.. currentmodule:: sklearn

We are pleased to announce the release of scikit-learn 1.5! Many bug fixes
and improvements were added, as well as some key new features. Below we
detail the highlights of this release. **For an exhaustive list of
all the changes**, please refer to the :ref:`release notes <release_notes_1_5>`.

To install the latest version (with pip)::

pip install --upgrade scikit-learn

or with conda::

conda install -c conda-forge scikit-learn

"""

# %%
# FixedThresholdClassifier: Setting the decision threshold of a binary classifier
# -------------------------------------------------------------------------------
# All binary classifiers of scikit-learn use a fixed decision threshold of 0.5 to
# convert probability estimates (i.e. output of `predict_proba`) into class
# predictions. However, 0.5 is almost never the desired threshold for a given problem.
# :class:`~model_selection.FixedThresholdClassifier` allows wrapping any binary
# classifier and setting a custom decision threshold.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
classifier = LogisticRegression(random_state=0).fit(X, y)

print("confusion matrix:\n", confusion_matrix(y, classifier.predict(X)))

# %%
# Lowering the threshold, i.e. allowing more samples to be classified as the positive
# class, increases the number of true positives at the cost of more false positives
# (as is well known from the concavity of the ROC curve).
from sklearn.model_selection import FixedThresholdClassifier

wrapped_classifier = FixedThresholdClassifier(classifier, threshold=0.1).fit(X, y)

print("confusion matrix:\n", confusion_matrix(y, wrapped_classifier.predict(X)))

# %%
# TunedThresholdClassifierCV: Tuning the decision threshold of a binary classifier
# --------------------------------------------------------------------------------
# The decision threshold of a binary classifier can be tuned to optimize a given
# metric, using :class:`~model_selection.TunedThresholdClassifierCV`.
from sklearn.metrics import balanced_accuracy_score

# Due to the class imbalance, the balanced accuracy is not optimal for the default
# threshold: the classifier tends to over-predict the majority class.
print(f"balanced accuracy: {balanced_accuracy_score(y, classifier.predict(X)):.2f}")

# %%
# Tuning the threshold to optimize the balanced accuracy gives a smaller threshold
# that allows more samples to be classified as the positive class.
from sklearn.model_selection import TunedThresholdClassifierCV

tuned_classifier = TunedThresholdClassifierCV(
    classifier, cv=5, scoring="balanced_accuracy"
).fit(X, y)

print(f"new threshold: {tuned_classifier.best_threshold_:.4f}")
print(
f"balanced accuracy: {balanced_accuracy_score(y, tuned_classifier.predict(X)):.2f}"
)

# %%
# :class:`~model_selection.TunedThresholdClassifierCV` also benefits from
# metadata routing support (:ref:`Metadata Routing User Guide <metadata_routing>`),
# allowing the optimization of complex business metrics, as detailed in
# :ref:`Post-tuning the decision threshold for cost-sensitive learning
# <sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py>`.

# %%
# Performance improvements in PCA
# -------------------------------
# :class:`~decomposition.PCA` has a new solver, "covariance_eigh", which is faster
# and more memory efficient than the other solvers for datasets with a large number
# of samples and a small number of features.
from sklearn.datasets import make_low_rank_matrix
from sklearn.decomposition import PCA

X = make_low_rank_matrix(
    n_samples=10_000, n_features=100, tail_strength=0.1, random_state=0
)

pca = PCA(n_components=10).fit(X)

print(f"explained variance: {pca.explained_variance_ratio_.sum():.2f}")

# %%
# The "full" solver has also been improved to use less memory and allows to
# transform faster. The "auto" option for the solver takes advantage of the
# new solver and is now able to select an appropriate solver for sparse
# datasets.
from scipy.sparse import random

X = random(10000, 100, format="csr", random_state=0)

pca = PCA(n_components=10, svd_solver="auto").fit(X)
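# %%
# The fit runs directly on the sparse matrix, without densifying it:
print(f"explained variance: {pca.explained_variance_ratio_.sum():.2f}")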

# %%
# ColumnTransformer is subscriptable
# ----------------------------------
# The transformers of a :class:`~compose.ColumnTransformer` can now be directly
# accessed using indexing by name.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

X = np.array([[0, 1, 2], [3, 4, 5]])
column_transformer = ColumnTransformer(
    [("std_scaler", StandardScaler(), [0]), ("one_hot", OneHotEncoder(), [1, 2])]
)

column_transformer.fit(X)

print(column_transformer["std_scaler"])
print(column_transformer["one_hot"])

# %%
# Custom imputation strategies for the SimpleImputer
# --------------------------------------------------
# :class:`~impute.SimpleImputer` now supports custom strategies for imputation,
# using a callable that computes a scalar value from the non-missing values of
# a column vector.
from sklearn.impute import SimpleImputer

X = np.array(
    [
        [-1.1, 1.1, 1.1],
        [3.9, -1.2, np.nan],
        [np.nan, 1.3, np.nan],
        [-0.1, -1.4, -1.4],
        [-4.9, 1.5, -1.5],
        [np.nan, 1.6, 1.6],
    ]
)


def smallest_abs(arr):
    """Return the smallest absolute value of a 1D array."""
    return np.min(np.abs(arr))


imputer = SimpleImputer(strategy=smallest_abs)

imputer.fit_transform(X)
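# %%
# The fill value computed by the callable for each column is stored in the
# `statistics_` attribute:
print(imputer.statistics_)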

# %%
# Pairwise distances with non-numeric arrays
# ------------------------------------------
# :func:`~metrics.pairwise_distances` can now compute distances between
# non-numeric arrays using a callable metric.
from sklearn.metrics import pairwise_distances

X = ["cat", "dog"]
Y = ["cat", "fox"]


def levenshtein_distance(x, y):
    """Return the Levenshtein distance between two strings."""
    if x == "" or y == "":
        return max(len(x), len(y))
    if x[0] == y[0]:
        return levenshtein_distance(x[1:], y[1:])
    return 1 + min(
        levenshtein_distance(x[1:], y),
        levenshtein_distance(x, y[1:]),
        levenshtein_distance(x[1:], y[1:]),
    )


pairwise_distances(X, Y, metric=levenshtein_distance)
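# %%
# The resulting distance matrix can be used like any other, for instance to
# find the closest string in `Y` for each string in `X` (a small illustrative
# use):
distances = pairwise_distances(X, Y, metric=levenshtein_distance)
print("closest matches:", [Y[i] for i in distances.argmin(axis=1)])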