diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py
index 3fc6bb93e64c9..82f5378d9d07b 100644
--- a/sklearn/ensemble/_forest.py
+++ b/sklearn/ensemble/_forest.py
@@ -51,7 +51,7 @@ class calls the ``fit`` method of each sub-estimator on random samples
 from joblib import Parallel
 
 from ..base import is_classifier
-from ..base import ClassifierMixin, RegressorMixin, MultiOutputMixin
+from ..base import ClassifierMixin, MultiOutputMixin, RegressorMixin
 from ..metrics import accuracy_score, r2_score
 from ..preprocessing import OneHotEncoder
 from ..tree import (
@@ -1052,6 +1052,9 @@ def _compute_partial_dependence_recursion(self, grid, target_features):
 
         return averaged_predictions
 
+    def _more_tags(self):
+        return {"multilabel": True}
+
 
 class RandomForestClassifier(ForestClassifier):
     """
diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
index 7f900f56e7e54..67d9a47881953 100644
--- a/sklearn/linear_model/_ridge.py
+++ b/sklearn/linear_model/_ridge.py
@@ -21,7 +21,7 @@
 from ._base import LinearClassifierMixin, LinearModel
 from ._base import _deprecate_normalize, _rescale_data
 from ._sag import sag_solver
-from ..base import RegressorMixin, MultiOutputMixin, is_classifier
+from ..base import MultiOutputMixin, RegressorMixin, is_classifier
 from ..utils.extmath import safe_sparse_dot
 from ..utils.extmath import row_norms
 from ..utils import check_array
@@ -2319,9 +2319,17 @@ def classes_(self):
 
     def _more_tags(self):
         return {
+            "multilabel": True,
             "_xfail_checks": {
                 "check_sample_weights_invariance": (
                     "zero sample_weight is not equivalent to removing samples"
                 ),
-            }
+                # FIXME: see
+                # https://github.com/scikit-learn/scikit-learn/issues/19858
+                # to track progress on resolving this issue
+                "check_classifiers_multilabel_output_format_predict": (
+                    "RidgeClassifierCV.predict outputs an array of shape (25,) "
+                    "instead of (25, 5)"
+                ),
+            },
         }
diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py
index 9ef91a7a2c3e3..4098b821cfebe 100644
--- a/sklearn/neighbors/_classification.py
+++ b/sklearn/neighbors/_classification.py
@@ -287,6 +287,9 @@ def predict_proba(self, X):
 
         return probabilities
 
+    def _more_tags(self):
+        return {"multilabel": True}
+
 
 class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase):
     """Classifier implementing a vote among neighbors within a given radius
@@ -651,3 +654,6 @@ def predict_proba(self, X):
         probabilities = probabilities[0]
 
         return probabilities
+
+    def _more_tags(self):
+        return {"multilabel": True}
diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py
index 8d3b0853b5336..ce57d7376665a 100644
--- a/sklearn/neural_network/_multilayer_perceptron.py
+++ b/sklearn/neural_network/_multilayer_perceptron.py
@@ -13,7 +13,11 @@
 import scipy.optimize
 
-from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
+from ..base import (
+    BaseEstimator,
+    ClassifierMixin,
+    RegressorMixin,
+)
 from ..base import is_classifier
 from ._base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS
 from ._stochastic_optimizers import SGDOptimizer, AdamOptimizer
@@ -1246,6 +1250,9 @@ def predict_proba(self, X):
         else:
             return y_pred
 
+    def _more_tags(self):
+        return {"multilabel": True}
+
 
 class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron):
     """Multi-layer Perceptron regressor.
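Note: the estimator-side changes above are mechanical: each classifier that genuinely supports multilabel-indicator targets now advertises it through `_more_tags`. As a minimal sketch of how the tag surfaces, assuming this patch is applied and using the private helper `sklearn.utils._tags._safe_tags` (available since scikit-learn 0.24, subject to change):

    # Sketch, not part of the patch: `_get_tags` walks the estimator's MRO and
    # merges every `_more_tags` dict, so the `multilabel` flag set above becomes
    # visible to the common-check machinery below.
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.utils._tags import _safe_tags  # private helper; assumed stable here

    # With this patch applied, KNeighborsClassifier reports multilabel support.
    assert _safe_tags(KNeighborsClassifier(), key="multilabel")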
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 0256513a64721..87a9b5f815e28 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -1021,6 +1021,9 @@ def predict_log_proba(self, X):
     def n_features_(self):
         return self.n_features_in_
 
+    def _more_tags(self):
+        return {"multilabel": True}
+
 
 class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree):
     """A decision tree regressor.
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index ff853be22f663..7749484ea5b22 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -19,6 +19,7 @@
 from ._testing import assert_array_almost_equal
 from ._testing import assert_allclose
 from ._testing import assert_allclose_dense_sparse
+from ._testing import assert_array_less
 from ._testing import set_random_state
 from ._testing import SkipTest
 from ._testing import ignore_warnings
@@ -141,6 +142,9 @@ def _yield_classifier_checks(classifier):
         yield check_classifiers_regression_target
     if tags["multilabel"]:
         yield check_classifiers_multilabel_representation_invariance
+        yield check_classifiers_multilabel_output_format_predict
+        yield check_classifiers_multilabel_output_format_predict_proba
+        yield check_classifiers_multilabel_output_format_decision_function
     if not tags["no_validation"]:
         yield check_supervised_y_no_nan
     if not tags["multioutput_only"]:
@@ -651,7 +655,7 @@ def _set_checking_parameters(estimator):
         estimator.set_params(strategy="stratified")
 
     # Speed-up by reducing the number of CV or splits for CV estimators
-    loo_cv = ["RidgeCV"]
+    loo_cv = ["RidgeCV", "RidgeClassifierCV"]
     if name not in loo_cv and hasattr(estimator, "cv"):
         estimator.set_params(cv=3)
     if hasattr(estimator, "n_splits"):
@@ -2258,18 +2262,18 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
         estimator.fit(X)
 
 
-@ignore_warnings(category=(FutureWarning))
+@ignore_warnings(category=FutureWarning)
 def check_classifiers_multilabel_representation_invariance(name, classifier_orig):
-
     X, y = make_multilabel_classification(
         n_samples=100,
-        n_features=20,
+        n_features=2,
         n_classes=5,
         n_labels=3,
         length=50,
         allow_unlabeled=True,
         random_state=0,
     )
+    X = scale(X)
 
     X_train, y_train = X[:80], y[:80]
     X_test = X[80:]
@@ -2299,6 +2303,181 @@ def check_classifiers_multilabel_representation_invariance(name, classifier_orig
     assert type(y_pred) == type(y_pred_list_of_lists)
 
 
+@ignore_warnings(category=FutureWarning)
+def check_classifiers_multilabel_output_format_predict(name, classifier_orig):
+    """Check the output of the `predict` method for classifiers supporting
+    multilabel-indicator targets."""
+    classifier = clone(classifier_orig)
+    set_random_state(classifier)
+
+    n_samples, test_size, n_outputs = 100, 25, 5
+    X, y = make_multilabel_classification(
+        n_samples=n_samples,
+        n_features=2,
+        n_classes=n_outputs,
+        n_labels=3,
+        length=50,
+        allow_unlabeled=True,
+        random_state=0,
+    )
+    X = scale(X)
+
+    X_train, X_test = X[:-test_size], X[-test_size:]
+    y_train, y_test = y[:-test_size], y[-test_size:]
+    classifier.fit(X_train, y_train)
+
+    response_method_name = "predict"
+    predict_method = getattr(classifier, response_method_name, None)
+    if predict_method is None:
+        raise SkipTest(f"{name} does not have a {response_method_name} method.")
+
+    y_pred = predict_method(X_test)
+
+    # y_pred.shape -> y_test.shape with the same dtype
+    assert isinstance(y_pred, np.ndarray), (
+        f"{name}.predict is expected to output a NumPy array. Got "
+        f"{type(y_pred)} instead."
+    )
+    assert y_pred.shape == y_test.shape, (
+        f"{name}.predict outputs a NumPy array of shape {y_pred.shape} "
+        f"instead of {y_test.shape}."
+    )
+    assert y_pred.dtype == y_test.dtype, (
+        f"{name}.predict does not output the same dtype as the targets. "
+        f"Got {y_pred.dtype} instead of {y_test.dtype}."
+    )
+
+
+@ignore_warnings(category=FutureWarning)
+def check_classifiers_multilabel_output_format_predict_proba(name, classifier_orig):
+    """Check the output of the `predict_proba` method for classifiers supporting
+    multilabel-indicator targets."""
+    classifier = clone(classifier_orig)
+    set_random_state(classifier)
+
+    n_samples, test_size, n_outputs = 100, 25, 5
+    X, y = make_multilabel_classification(
+        n_samples=n_samples,
+        n_features=2,
+        n_classes=n_outputs,
+        n_labels=3,
+        length=50,
+        allow_unlabeled=True,
+        random_state=0,
+    )
+    X = scale(X)
+
+    X_train, X_test = X[:-test_size], X[-test_size:]
+    y_train = y[:-test_size]
+    classifier.fit(X_train, y_train)
+
+    response_method_name = "predict_proba"
+    predict_proba_method = getattr(classifier, response_method_name, None)
+    if predict_proba_method is None:
+        raise SkipTest(f"{name} does not have a {response_method_name} method.")
+
+    y_pred = predict_proba_method(X_test)
+
+    # y_pred.shape -> 2 possibilities:
+    # - list of length n_outputs of shape (n_samples, 2);
+    # - ndarray of shape (n_samples, n_outputs).
+    # dtype should be floating
+    if isinstance(y_pred, list):
+        assert len(y_pred) == n_outputs, (
+            f"When {name}.predict_proba returns a list, the list should "
+            "be of length n_outputs and contain NumPy arrays. Got length "
+            f"of {len(y_pred)} instead of {n_outputs}."
+        )
+        for pred in y_pred:
+            assert pred.shape == (test_size, 2), (
+                f"When {name}.predict_proba returns a list, this list "
+                "should contain NumPy arrays of shape (n_samples, 2). Got "
+                f"NumPy arrays of shape {pred.shape} instead of "
+                f"{(test_size, 2)}."
+            )
+            assert pred.dtype.kind == "f", (
+                f"When {name}.predict_proba returns a list, it should "
+                "contain NumPy arrays with floating dtype. Got "
+                f"{pred.dtype} instead."
+            )
+            # check that we have the correct probabilities
+            err_msg = (
+                f"When {name}.predict_proba returns a list, each NumPy "
+                "array should contain probabilities for each class and "
+                "thus each row should sum to 1 (or close to 1 due to "
+                "numerical errors)."
+            )
+            assert_allclose(pred.sum(axis=1), 1, err_msg=err_msg)
+    elif isinstance(y_pred, np.ndarray):
+        assert y_pred.shape == (test_size, n_outputs), (
+            f"When {name}.predict_proba returns a NumPy array, the "
+            f"expected shape is (n_samples, n_outputs). Got {y_pred.shape}"
+            f" instead of {(test_size, n_outputs)}."
+        )
+        assert y_pred.dtype.kind == "f", (
+            f"When {name}.predict_proba returns a NumPy array, the "
+            f"expected data type is floating. Got {y_pred.dtype} instead."
+        )
+        err_msg = (
+            f"When {name}.predict_proba returns a NumPy array, this array "
+            "is expected to provide probabilities of the positive class "
+            "and should therefore contain values between 0 and 1."
+        )
+        assert_array_less(0, y_pred, err_msg=err_msg)
+        assert_array_less(y_pred, 1, err_msg=err_msg)
+    else:
+        raise ValueError(
+            f"Unknown returned type {type(y_pred)} by {name}."
+            "predict_proba. A list or a NumPy array is expected."
+        )
+
+
+@ignore_warnings(category=FutureWarning)
+def check_classifiers_multilabel_output_format_decision_function(name, classifier_orig):
+    """Check the output of the `decision_function` method for classifiers supporting
+    multilabel-indicator targets."""
+    classifier = clone(classifier_orig)
+    set_random_state(classifier)
+
+    n_samples, test_size, n_outputs = 100, 25, 5
+    X, y = make_multilabel_classification(
+        n_samples=n_samples,
+        n_features=2,
+        n_classes=n_outputs,
+        n_labels=3,
+        length=50,
+        allow_unlabeled=True,
+        random_state=0,
+    )
+    X = scale(X)
+
+    X_train, X_test = X[:-test_size], X[-test_size:]
+    y_train = y[:-test_size]
+    classifier.fit(X_train, y_train)
+
+    response_method_name = "decision_function"
+    decision_function_method = getattr(classifier, response_method_name, None)
+    if decision_function_method is None:
+        raise SkipTest(f"{name} does not have a {response_method_name} method.")
+
+    y_pred = decision_function_method(X_test)
+
+    # y_pred.shape -> y_test.shape with floating dtype
+    assert isinstance(y_pred, np.ndarray), (
+        f"{name}.decision_function is expected to output a NumPy array."
+        f" Got {type(y_pred)} instead."
+    )
+    assert y_pred.shape == (test_size, n_outputs), (
+        f"{name}.decision_function is expected to provide a NumPy array "
+        f"of shape (n_samples, n_outputs). Got {y_pred.shape} instead of "
+        f"{(test_size, n_outputs)}."
+    )
+    assert y_pred.dtype.kind == "f", (
+        f"{name}.decision_function is expected to output a floating dtype."
+        f" Got {y_pred.dtype} instead."
+    )
+
+
 @ignore_warnings(category=FutureWarning)
 def check_estimators_fit_returns_self(name, estimator_orig, readonly_memmap=False):
     """Check if self is returned when calling fit."""
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index 8ff8d8cf5e782..ea158234ea785 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -11,6 +11,7 @@
 import joblib
 
 from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.datasets import make_multilabel_classification
 from sklearn.utils import deprecated
 from sklearn.utils._testing import (
     raises,
@@ -20,18 +21,7 @@
     MinimalTransformer,
     SkipTest,
 )
-from sklearn.utils.estimator_checks import check_estimator, _NotAnArray
-from sklearn.utils.estimator_checks import check_class_weight_balanced_linear_classifier
-from sklearn.utils.estimator_checks import set_random_state
-from sklearn.utils.estimator_checks import _set_checking_parameters
-from sklearn.utils.estimator_checks import check_estimators_unfitted
-from sklearn.utils.estimator_checks import check_fit_score_takes_y
-from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
-from sklearn.utils.estimator_checks import check_classifier_data_not_an_array
-from sklearn.utils.estimator_checks import check_regressor_data_not_an_array
-from sklearn.utils.estimator_checks import check_estimator_get_tags_default_keys
 from sklearn.utils.validation import check_is_fitted
-from sklearn.utils.estimator_checks import check_outlier_corruption
 from sklearn.utils.fixes import np_version, parse_version
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LinearRegression, SGDClassifier
@@ -45,6 +35,24 @@
 from sklearn.utils import all_estimators
 from sklearn.exceptions import SkipTestWarning
 
+from sklearn.utils.estimator_checks import (
+    _NotAnArray,
+    _set_checking_parameters,
+    check_class_weight_balanced_linear_classifier,
+    check_classifier_data_not_an_array,
+    check_classifiers_multilabel_output_format_decision_function,
+    check_classifiers_multilabel_output_format_predict,
+    check_classifiers_multilabel_output_format_predict_proba,
+    check_estimator,
+    check_estimator_get_tags_default_keys,
+    check_estimators_unfitted,
+    check_fit_score_takes_y,
+    check_no_attributes_set_in_init,
+    check_outlier_corruption,
+    check_regressor_data_not_an_array,
+    set_random_state,
+)
+
 
 class CorrectNotFittedError(ValueError):
     """Exception class to raise if estimator is used before fitting.
@@ -689,6 +697,236 @@ def test_check_estimator_get_tags_default_keys():
     check_estimator_get_tags_default_keys(estimator.__class__.__name__, estimator)
 
 
+class _BaseMultiLabelClassifierMock(ClassifierMixin, BaseEstimator):
+    def __init__(self, response_output):
+        self.response_output = response_output
+
+    def fit(self, X, y):
+        return self
+
+    def _more_tags(self):
+        return {"multilabel": True}
+
+
+def test_check_classifiers_multilabel_output_format_predict():
+    n_samples, test_size, n_outputs = 100, 25, 5
+    _, y = make_multilabel_classification(
+        n_samples=n_samples,
+        n_features=2,
+        n_classes=n_outputs,
+        n_labels=3,
+        length=50,
+        allow_unlabeled=True,
+        random_state=0,
+    )
+    y_test = y[-test_size:]
+
+    class MultiLabelClassifierPredict(_BaseMultiLabelClassifierMock):
+        def predict(self, X):
+            return self.response_output
+
+    # 1. inconsistent array type
+    clf = MultiLabelClassifierPredict(response_output=y_test.tolist())
+    err_msg = (
+        r"MultiLabelClassifierPredict.predict is expected to output a "
+        r"NumPy array. Got .+ instead."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict(clf.__class__.__name__, clf)
+    # 2. inconsistent shape
+    clf = MultiLabelClassifierPredict(response_output=y_test[:, :-1])
+    err_msg = (
+        r"MultiLabelClassifierPredict.predict outputs a NumPy array of "
+        r"shape \(25, 4\) instead of \(25, 5\)."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict(clf.__class__.__name__, clf)
+    # 3. inconsistent dtype
+    clf = MultiLabelClassifierPredict(response_output=y_test.astype(np.float64))
+    err_msg = (
+        r"MultiLabelClassifierPredict.predict does not output the same "
+        r"dtype as the targets."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict(clf.__class__.__name__, clf)
+
+
+def test_check_classifiers_multilabel_output_format_predict_proba():
+    n_samples, test_size, n_outputs = 100, 25, 5
+    _, y = make_multilabel_classification(
+        n_samples=n_samples,
+        n_features=2,
+        n_classes=n_outputs,
+        n_labels=3,
+        length=50,
+        allow_unlabeled=True,
+        random_state=0,
+    )
+    y_test = y[-test_size:]
+
+    class MultiLabelClassifierPredictProba(_BaseMultiLabelClassifierMock):
+        def predict_proba(self, X):
+            return self.response_output
+
+    # 1. unknown output type
+    clf = MultiLabelClassifierPredictProba(response_output=sp.csr_matrix(y_test))
+    err_msg = (
+        r"Unknown returned type .+ by "
+        r"MultiLabelClassifierPredictProba.predict_proba. A list or a NumPy "
+        r"array is expected."
+    )
+    with raises(ValueError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict_proba(
+            clf.__class__.__name__,
+            clf,
+        )
+    # 2. for list output
+    # 2.1. inconsistent length
+    clf = MultiLabelClassifierPredictProba(response_output=y_test.tolist())
+    err_msg = (
+        "When MultiLabelClassifierPredictProba.predict_proba returns a list, "
+        "the list should be of length n_outputs and contain NumPy arrays. Got "
+        f"length of {test_size} instead of {n_outputs}."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict_proba(
+            clf.__class__.__name__,
+            clf,
+        )
+    # 2.2. array of inconsistent shape
+    response_output = [np.ones_like(y_test) for _ in range(n_outputs)]
+    clf = MultiLabelClassifierPredictProba(response_output=response_output)
+    err_msg = (
+        r"When MultiLabelClassifierPredictProba.predict_proba returns a list, "
+        r"this list should contain NumPy arrays of shape \(n_samples, 2\). Got "
+        r"NumPy arrays of shape \(25, 5\) instead of \(25, 2\)."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict_proba(
+            clf.__class__.__name__,
+            clf,
+        )
+    # 2.3. array of inconsistent dtype
+    response_output = [
+        np.ones(shape=(y_test.shape[0], 2), dtype=np.int64) for _ in range(n_outputs)
+    ]
+    clf = MultiLabelClassifierPredictProba(response_output=response_output)
+    err_msg = (
+        "When MultiLabelClassifierPredictProba.predict_proba returns a list, "
+        "it should contain NumPy arrays with floating dtype."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict_proba(
+            clf.__class__.__name__,
+            clf,
+        )
+    # 2.4. array does not contain probabilities (each row should sum to 1)
+    response_output = [
+        np.ones(shape=(y_test.shape[0], 2), dtype=np.float64) for _ in range(n_outputs)
+    ]
+    clf = MultiLabelClassifierPredictProba(response_output=response_output)
+    err_msg = (
+        r"When MultiLabelClassifierPredictProba.predict_proba returns a list, "
+        r"each NumPy array should contain probabilities for each class and "
+        r"thus each row should sum to 1"
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict_proba(
+            clf.__class__.__name__,
+            clf,
+        )
+    # 3. for array output
+    # 3.1. array of inconsistent shape
+    clf = MultiLabelClassifierPredictProba(response_output=y_test[:, :-1])
+    err_msg = (
+        r"When MultiLabelClassifierPredictProba.predict_proba returns a NumPy "
+        r"array, the expected shape is \(n_samples, n_outputs\). Got \(25, 4\)"
+        r" instead of \(25, 5\)."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict_proba(
+            clf.__class__.__name__,
+            clf,
+        )
+    # 3.2. array of inconsistent dtype
+    response_output = np.zeros_like(y_test, dtype=np.int64)
+    clf = MultiLabelClassifierPredictProba(response_output=response_output)
+    err_msg = (
+        r"When MultiLabelClassifierPredictProba.predict_proba returns a NumPy "
+        r"array, the expected data type is floating."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict_proba(
+            clf.__class__.__name__,
+            clf,
+        )
+    # 3.3. array does not contain probabilities
+    clf = MultiLabelClassifierPredictProba(response_output=y_test * 2.0)
+    err_msg = (
+        r"When MultiLabelClassifierPredictProba.predict_proba returns a NumPy "
+        r"array, this array is expected to provide probabilities of the "
+        r"positive class and should therefore contain values between 0 and 1."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_predict_proba(
+            clf.__class__.__name__,
+            clf,
+        )
+
+
+def test_check_classifiers_multilabel_output_format_decision_function():
+    n_samples, test_size, n_outputs = 100, 25, 5
+    _, y = make_multilabel_classification(
+        n_samples=n_samples,
+        n_features=2,
+        n_classes=n_outputs,
+        n_labels=3,
+        length=50,
+        allow_unlabeled=True,
+        random_state=0,
+    )
+    y_test = y[-test_size:]
+
+    class MultiLabelClassifierDecisionFunction(_BaseMultiLabelClassifierMock):
+        def decision_function(self, X):
+            return self.response_output
+
+    # 1. inconsistent array type
+    clf = MultiLabelClassifierDecisionFunction(response_output=y_test.tolist())
+    err_msg = (
+        r"MultiLabelClassifierDecisionFunction.decision_function is expected "
+        r"to output a NumPy array. Got .+ instead."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_decision_function(
+            clf.__class__.__name__,
+            clf,
+        )
+    # 2. inconsistent shape
+    clf = MultiLabelClassifierDecisionFunction(response_output=y_test[:, :-1])
+    err_msg = (
+        r"MultiLabelClassifierDecisionFunction.decision_function is expected "
+        r"to provide a NumPy array of shape \(n_samples, n_outputs\). Got "
+        r"\(25, 4\) instead of \(25, 5\)"
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_decision_function(
+            clf.__class__.__name__,
+            clf,
+        )
+    # 3. inconsistent dtype
+    clf = MultiLabelClassifierDecisionFunction(response_output=y_test)
+    err_msg = (
+        r"MultiLabelClassifierDecisionFunction.decision_function is expected "
+        r"to output a floating dtype."
+    )
+    with raises(AssertionError, match=err_msg):
+        check_classifiers_multilabel_output_format_decision_function(
+            clf.__class__.__name__,
+            clf,
+        )
+
+
 def run_tests_without_pytest():
     """Runs the tests in this file without using pytest."""
     main_module = sys.modules["__main__"]
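Note: the three new checks are not meant to be called directly. `_yield_classifier_checks` yields them for any classifier whose `multilabel` tag is true, so they execute through the usual public entry points. A usage sketch with the documented `parametrize_with_checks` pytest helper (any estimator advertising the tag would do; `DecisionTreeClassifier` is just one of those touched by this patch):

    # Usage sketch: with this patch applied, the multilabel output-format
    # checks run automatically as part of the standard common-test suite.
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.utils.estimator_checks import parametrize_with_checks

    @parametrize_with_checks([DecisionTreeClassifier()])
    def test_sklearn_compatible_estimator(estimator, check):
        check(estimator)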