ENH accept class_of_interest in DecisionBoundaryDisplay to inspect multiclass classifiers #27291

Merged (21 commits, Oct 11, 2023)
14 changes: 14 additions & 0 deletions doc/whats_new/v1.4.rst
@@ -280,6 +280,20 @@ Changelog
deprecated and will be removed in 1.6. :pr:`26830` by :user:`Stefanie Senger
<StefanieSenger>`.

:mod:`sklearn.inspection`
.........................

- |Enhancement| :class:`inspection.DecisionBoundaryDisplay` now accepts a parameter
  `class_of_interest` to select the class of interest when plotting the response
  provided by `response_method="predict_proba"` or
  `response_method="decision_function"`. It allows plotting the decision boundary
  for both binary and multiclass classifiers.
  :pr:`27291` by :user:`Guillaume Lemaitre <glemaitre>`.
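As an illustration of the new parameter (a minimal sketch, not taken from the PR; the
iris setup and variable names are only for demonstration):

    import matplotlib.pyplot as plt

    from sklearn.datasets import load_iris
    from sklearn.inspection import DecisionBoundaryDisplay
    from sklearn.linear_model import LogisticRegression

    # Fit a multiclass classifier on two features so the boundary can be plotted.
    X, y = load_iris(return_X_y=True)
    X = X[:, :2]
    clf = LogisticRegression().fit(X, y)

    # Plot the probability map of class 2 only; before this change, multiclass
    # estimators were limited to response_method="predict" or "auto".
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        response_method="predict_proba",
        class_of_interest=2,
    )
    plt.show()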

- |API| :class:`inspection.DecisionBoundaryDisplay` now raises an `AttributeError`
  instead of a `ValueError` when an estimator does not implement the requested
  response method.
  :pr:`27291` by :user:`Guillaume Lemaitre <glemaitre>`.
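A hedged illustration of the new error type (not from the PR; `SVC` without
`probability=True` is used here only because it does not expose `predict_proba`):

    from sklearn.datasets import make_classification
    from sklearn.inspection import DecisionBoundaryDisplay
    from sklearn.svm import SVC

    X, y = make_classification(n_features=2, n_redundant=0, random_state=0)
    clf = SVC().fit(X, y)  # probability=False, so predict_proba is not available

    try:
        DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict_proba")
    except AttributeError as exc:
        # In 1.4 this surfaces as an AttributeError; earlier releases raised ValueError.
        print(exc)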

:mod:`sklearn.linear_model`
...........................

63 changes: 32 additions & 31 deletions examples/classification/plot_classification_probability.py
@@ -22,10 +22,12 @@

import matplotlib.pyplot as plt
import numpy as np
+from matplotlib import cm

from sklearn import datasets
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
+from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
@@ -56,40 +58,39 @@

n_classifiers = len(classifiers)

-plt.figure(figsize=(3 * 2, n_classifiers * 2))
-plt.subplots_adjust(bottom=0.2, top=0.95)
-
-xx = np.linspace(3, 9, 100)
-yy = np.linspace(1, 5, 100).T
-xx, yy = np.meshgrid(xx, yy)
-Xfull = np.c_[xx.ravel(), yy.ravel()]
-
-for index, (name, classifier) in enumerate(classifiers.items()):
-    classifier.fit(X, y)
-
-    y_pred = classifier.predict(X)
+fig, axes = plt.subplots(
+    nrows=n_classifiers,
+    ncols=len(iris.target_names),
+    figsize=(3 * 2, n_classifiers * 2),
+)
+for classifier_idx, (name, classifier) in enumerate(classifiers.items()):
+    y_pred = classifier.fit(X, y).predict(X)
    accuracy = accuracy_score(y, y_pred)
-    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
-
-    # View probabilities:
-    probas = classifier.predict_proba(Xfull)
-    n_classes = np.unique(y_pred).size
-    for k in range(n_classes):
-        plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)
-        plt.title("Class %d" % k)
-        if k == 0:
-            plt.ylabel(name)
-        imshow_handle = plt.imshow(
-            probas[:, k].reshape((100, 100)), extent=(3, 9, 1, 5), origin="lower"
-        )
-        plt.xticks(())
-        plt.yticks(())
-        idx = y_pred == k
-        if idx.any():
-            plt.scatter(X[idx, 0], X[idx, 1], marker="o", c="w", edgecolor="k")
+    print(f"Accuracy (train) for {name}: {accuracy:0.1%}")
+    for label in np.unique(y):
+        # plot the probability estimate provided by the classifier
+        disp = DecisionBoundaryDisplay.from_estimator(
+            classifier,
+            X,
+            response_method="predict_proba",
+            class_of_interest=label,
+            ax=axes[classifier_idx, label],
+            vmin=0,
+            vmax=1,
+        )
+        axes[classifier_idx, label].set_title(f"Class {label}")
+        # plot data predicted to belong to given class
+        mask_y_pred = y_pred == label
+        axes[classifier_idx, label].scatter(
+            X[mask_y_pred, 0], X[mask_y_pred, 1], marker="o", c="w", edgecolor="k"
+        )
+        axes[classifier_idx, label].set(xticks=(), yticks=())
+    axes[classifier_idx, 0].set_ylabel(name)

-ax = plt.axes([0.15, 0.04, 0.7, 0.05])
+ax = plt.axes([0.15, 0.04, 0.7, 0.02])
plt.title("Probability")
-plt.colorbar(imshow_handle, cax=ax, orientation="horizontal")
+_ = plt.colorbar(
+    cm.ScalarMappable(norm=None, cmap="viridis"), cax=ax, orientation="horizontal"
+)

plt.show()
82 changes: 57 additions & 25 deletions sklearn/inspection/_plot/decision_boundary.py
@@ -1,19 +1,18 @@
-from functools import reduce
-
import numpy as np

from ...base import is_regressor
from ...preprocessing import LabelEncoder
from ...utils import _safe_indexing, check_matplotlib_support
+from ...utils._response import _get_response_values
from ...utils.validation import (
    _is_arraylike_not_scalar,
    _num_features,
    check_is_fitted,
)


-def _check_boundary_response_method(estimator, response_method):
-    """Return prediction method from the `response_method` for decision boundary.
+def _check_boundary_response_method(estimator, response_method, class_of_interest):
+    """Validate the response methods to be used with the fitted estimator.

    Parameters
    ----------
@@ -26,36 +25,38 @@ def _check_boundary_response_method(estimator, response_method):
        If set to 'auto', the response method is tried in the following order:
        :term:`decision_function`, :term:`predict_proba`, :term:`predict`.

+    class_of_interest : int, float, bool, str or None
+        The class considered when plotting the decision. If the label is specified, it
+        is then possible to plot the decision boundary in multiclass settings.
+
+        .. versionadded:: 1.4
+
    Returns
    -------
-    prediction_method: callable
-        Prediction method of estimator.
+    prediction_method : list of str or str
+        The name or list of names of the response methods to use.
    """
    has_classes = hasattr(estimator, "classes_")
    if has_classes and _is_arraylike_not_scalar(estimator.classes_[0]):
        msg = "Multi-label and multi-output multi-class classifiers are not supported"
        raise ValueError(msg)

    if has_classes and len(estimator.classes_) > 2:
-        if response_method not in {"auto", "predict"}:
+        if response_method not in {"auto", "predict"} and class_of_interest is None:
            msg = (
-                "Multiclass classifiers are only supported when response_method is"
-                " 'predict' or 'auto'"
+                "Multiclass classifiers are only supported when `response_method` is "
+                "'predict' or 'auto'. Else you must provide `class_of_interest` to "
+                "plot the decision boundary of a specific class."
            )
            raise ValueError(msg)
-        methods_list = ["predict"]
+        prediction_method = "predict" if response_method == "auto" else response_method
    elif response_method == "auto":
-        methods_list = ["decision_function", "predict_proba", "predict"]
+        if is_regressor(estimator):
+            prediction_method = "predict"
+        else:
+            prediction_method = ["decision_function", "predict_proba", "predict"]
    else:
-        methods_list = [response_method]
-
-    prediction_method = [getattr(estimator, method, None) for method in methods_list]
-    prediction_method = reduce(lambda x, y: x or y, prediction_method)
-    if prediction_method is None:
-        raise ValueError(
-            f"{estimator.__class__.__name__} has none of the following attributes: "
-            f"{', '.join(methods_list)}."
-        )
+        prediction_method = response_method

    return prediction_method
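For context only (not part of the diff), the validation above translates to the
following behavior of the public API for a three-class problem; the setup below is
illustrative:

    from sklearn.datasets import load_iris
    from sklearn.inspection import DecisionBoundaryDisplay
    from sklearn.linear_model import LogisticRegression

    X, y = load_iris(return_X_y=True)
    X = X[:, :2]
    clf = LogisticRegression().fit(X, y)  # three classes

    # "auto" falls back to "predict" for multiclass classifiers.
    DecisionBoundaryDisplay.from_estimator(clf, X, response_method="auto")

    # Requesting probabilities without a class of interest hits the ValueError above.
    try:
        DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict_proba")
    except ValueError as exc:
        print(exc)

    # Providing class_of_interest makes the multiclass case valid.
    DecisionBoundaryDisplay.from_estimator(
        clf, X, response_method="predict_proba", class_of_interest=1
    )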

@@ -206,6 +207,7 @@ def from_estimator(
        eps=1.0,
        plot_method="contourf",
        response_method="auto",
+        class_of_interest=None,
        xlabel=None,
        ylabel=None,
        ax=None,
@@ -248,6 +250,14 @@
            For multiclass problems, :term:`predict` is selected when
            `response_method="auto"`.

+        class_of_interest : int, float, bool or str, default=None
[Review comment] Member:
I'm not sure what float and bool here mean? Do we accept floats for classification
targets? Is bool accepted when the target is boolean?

[Review comment] Member (Author):
> do we accept floats for classification targets?

(Unfortunately) yes, we do. The types reported here are the same as the ones accepted
as `pos_label` in the thresholded metrics.
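To make the point above concrete, a small sketch (not part of the PR):
`class_of_interest` is matched against `estimator.classes_`, so a boolean (or float)
label is accepted whenever the target uses that type, just like `pos_label` in the
thresholded metrics:

    import numpy as np

    from sklearn.inspection import DecisionBoundaryDisplay
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 2))
    y = X[:, 0] + X[:, 1] > 0  # boolean target, so classes_ is [False, True]

    clf = LogisticRegression().fit(X, y)
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        response_method="predict_proba",
        class_of_interest=True,  # boolean label, analogous to pos_label in metrics
    )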

+            The class considered when plotting the decision. If None,
+            `estimator.classes_[1]` is considered as the positive class
+            for binary classifiers. For multiclass classifiers, passing
+            an explicit value for `class_of_interest` is mandatory.
+
+            .. versionadded:: 1.4

        xlabel : str, default=None
            The label used for the x-axis. If `None`, an attempt is made to
            extract a label from `X` if it is a dataframe, otherwise an empty
@@ -342,11 +352,30 @@ def from_estimator(
        else:
            X_grid = np.c_[xx0.ravel(), xx1.ravel()]

-        pred_func = _check_boundary_response_method(estimator, response_method)
-        response = pred_func(X_grid)
+        prediction_method = _check_boundary_response_method(
+            estimator, response_method, class_of_interest
+        )
+        try:
+            response, _, response_method_used = _get_response_values(
+                estimator,
+                X_grid,
+                response_method=prediction_method,
+                pos_label=class_of_interest,
+                return_response_method_used=True,
+            )
+        except ValueError as exc:
+            if "is not a valid label" in str(exc):
+                # re-raise a more informative error message since `pos_label` is unknown
+                # to our user when interacting with
+                # `DecisionBoundaryDisplay.from_estimator`
+                raise ValueError(
+                    f"class_of_interest={class_of_interest} is not a valid label: It "
+                    f"should be one of {estimator.classes_}"
+                ) from exc
+            raise

        # convert classes predictions into integers
-        if pred_func.__name__ == "predict" and hasattr(estimator, "classes_"):
+        if response_method_used == "predict" and hasattr(estimator, "classes_"):
            encoder = LabelEncoder()
            encoder.classes_ = estimator.classes_
            response = encoder.transform(response)
@@ -355,8 +384,11 @@
            if is_regressor(estimator):
                raise ValueError("Multi-output regressors are not supported")

-            # TODO: Support pos_label
-            response = response[:, 1]
+            # For the multiclass case, `_get_response_values` returns the response
+            # as-is. Thus, we have a column per class and we need to select the column
+            # corresponding to the positive class.
+            col_idx = np.flatnonzero(estimator.classes_ == class_of_interest)[0]
+            response = response[:, col_idx]

        if xlabel is None:
            xlabel = X.columns[0] if hasattr(X, "columns") else ""