From 305a63cfa40129e5a5e0d389d98a800546492603 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 26 Jul 2021 16:38:45 +0200 Subject: [PATCH] TST split API checks from other checks --- sklearn/utils/common_utils_checks.py | 114 ++++++++++++++++++++ sklearn/utils/estimator_api_checks.py | 92 ++++++++++++++++ sklearn/utils/estimator_checks.py | 148 +++----------------------- 3 files changed, 222 insertions(+), 132 deletions(-) create mode 100644 sklearn/utils/common_utils_checks.py create mode 100644 sklearn/utils/estimator_api_checks.py diff --git a/sklearn/utils/common_utils_checks.py b/sklearn/utils/common_utils_checks.py new file mode 100644 index 0000000000000..589747d03acbb --- /dev/null +++ b/sklearn/utils/common_utils_checks.py @@ -0,0 +1,114 @@ +import numpy as np + +from ..base import _is_pairwise +from ..metrics.pairwise import linear_kernel, pairwise_distances + +from ._tags import _safe_tags + + +def _enforce_estimator_tags_y(estimator, y): + """Modify `y` to be compatible with the available estimator tags. + + Parameters + ---------- + estimator : object + Estimator object to test. + + y : ndarray + The data to be converted. + + Returns + ------- + y : ndarray + The converted data. + """ + # Estimators with a `requires_positive_y` tag only accept strictly positive + # data + if _safe_tags(estimator, key="requires_positive_y"): + # Create strictly positive y. The minimal increment above 0 is 1, as + # y could be of integer dtype. + y += 1 + abs(y.min()) + # Estimators with a `binary_only` tag only accept up to two unique y values + if _safe_tags(estimator, key="binary_only") and y.size > 0: + y = np.where(y == y.flat[0], y, y.flat[0] + 1) + # Estimators in mono_output_task_error raise ValueError if y is of 1-D + # Convert into a 2-D y for those estimators. + if _safe_tags(estimator, key="multioutput_only"): + return np.reshape(y, (-1, 1)) + return y + + +def _enforce_estimator_tags_x(estimator, X): + """Modify `X` to be compatible with the available estimator tags. + + Parameters + ---------- + estimator : object + Estimator object to test. + + X : ndarray + The data to be converted. + + Returns + ------- + X : ndarray + The converted data. + """ + # Pairwise estimators only accept + # X of shape (`n_samples`, `n_samples`) + if _is_pairwise(estimator): + X = X.dot(X.T) + # Estimators with `1darray` in `X_types` tag only accept + # X of shape (`n_samples`,) + if "1darray" in _safe_tags(estimator, key="X_types"): + X = X[:, 0] + # Estimators with a `requires_positive_X` tag only accept + # strictly positive data + if _safe_tags(estimator, key="requires_positive_X"): + X -= X.min() + return X + + +def _is_pairwise_metric(estimator): + """Returns True if estimator accepts pairwise metric. + + Parameters + ---------- + estimator : object + Estimator object to test. + + Returns + ------- + out : bool + True if _pairwise is set to True and False otherwise. + """ + metric = getattr(estimator, "metric", None) + + return bool(metric == "precomputed") + + +def _pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel): + """Convert `X` so to be used by a pairwise estimator. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data to be converted. + + estimator : object + An estimator to apply on `X`. + + kernel : callable, default=linear_kernel + If `estimator` requires a kernel, this parameter will transform `X` + into a kernel matrix. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) + The converted `X`. + """ + if _is_pairwise_metric(estimator): + return pairwise_distances(X, metric="euclidean") + if _is_pairwise(estimator): + return kernel(X, X) + return X diff --git a/sklearn/utils/estimator_api_checks.py b/sklearn/utils/estimator_api_checks.py new file mode 100644 index 0000000000000..bb153189b863d --- /dev/null +++ b/sklearn/utils/estimator_api_checks.py @@ -0,0 +1,92 @@ +"""Checks for minimal scikit-learn estimator support.""" +from inspect import signature + +import numpy as np + +# from . import IS_PYPY +from ..base import clone +from ._testing import _get_args, ignore_warnings, set_random_state + +from .common_utils_checks import ( + # _enforce_estimator_tags_x, + _enforce_estimator_tags_y, + _pairwise_estimator_convert_X, +) + + +def _yield_api_estimator_checks(estimator): + # name = estimator.__class__.__name__ + # tags = _safe_tags(estimator) + # pairwise = _is_pairwise(estimator) + + yield check_no_attributes_set_in_init + yield check_takes_at_least_optional_y + + +@ignore_warnings(category=FutureWarning) +def check_no_attributes_set_in_init(name, estimator_orig): + """Check attribute setting at `__init__`.""" + try: + # Clone fails if the estimator does not store + # all parameters as an attribute during init + estimator = clone(estimator_orig) + except AttributeError: + raise AttributeError( + f"Estimator {name} should store all parameters as an attribute during init." + " Cloning mechanism will not work otherwise." + ) + + init_params = _get_args(type(estimator).__init__) + # TODO: check if we can get more generic and not have a special case for PyPy + # if IS_PYPY: + # # __init__ signature has additional objects in PyPy + # for key in ["obj"]: + # if key in init_params: + # init_params.remove(key) + parents_init_params = [ + param + for params_parent in (_get_args(parent) for parent in type(estimator).__mro__) + for param in params_parent + ] + + # Test for no setting apart from parameters during init + invalid_attr = set(vars(estimator)) - set(init_params) - set(parents_init_params) + assert not invalid_attr, ( + f"Estimator {name} should not set any attribute apart" + f" from parameters during init. Found attributes {sorted(invalid_attr)}." + ) + + +@ignore_warnings +def check_takes_at_least_optional_y(name, estimator_orig): + """Check that estimator accepts an optional `y` to be compatible with + `Pipeline`. + + Tha parameter `y` should be available in the following methods: `fit`, + `score`, `partial_fit`, `fit_predict`, `fit_transform`. + """ + estimator = clone(estimator_orig) + set_random_state(estimator) + + rnd = np.random.RandomState(0) + n_samples = 30 + X = rnd.uniform(size=(n_samples, 3)) + X = _pairwise_estimator_convert_X(X, estimator_orig) + y = np.arange(n_samples) % 3 + y = _enforce_estimator_tags_y(estimator, y) + + supported_methods = ["fit", "score", "partial_fit", "fit_predict", "fit_transform"] + for method_name in supported_methods: + method = getattr(estimator, method_name, None) + if method is not None: + method(X, y) + args = [p.name for p in signature(method).parameters.values()] + if args[0] == "self": + # `if_delegate_has_method` or `available_if` makes methods + # into functions with an explicit "self", so need to shift + # arguments + args = args[1:] + assert args[1] in ["y", "Y"], ( + "Expected `y` or `Y` as second argument for method " + f"{method_name}of {name}. Got arguments: {repr(args)}." + ) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index ff853be22f663..eb3aa425d76c6 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -11,9 +11,10 @@ from scipy.stats import rankdata import joblib -from . import IS_PYPY +# from . import IS_PYPY from .. import config_context -from ._testing import _get_args + +# from ._testing import _get_args from ._testing import assert_raise_message from ._testing import assert_array_equal from ._testing import assert_array_almost_equal @@ -49,7 +50,8 @@ from ..model_selection import train_test_split from ..model_selection import ShuffleSplit from ..model_selection._validation import _safe_split -from ..metrics.pairwise import rbf_kernel, linear_kernel, pairwise_distances +from ..metrics.pairwise import rbf_kernel + from . import shuffle from ._tags import ( @@ -66,6 +68,14 @@ make_regression, ) +from .common_utils_checks import ( + _enforce_estimator_tags_x, + _enforce_estimator_tags_y, + _is_pairwise_metric, + _pairwise_estimator_convert_X, +) +from .estimator_api_checks import _yield_api_estimator_checks + REGRESSION_DATASET = None CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"] @@ -75,9 +85,7 @@ def _yield_checks(estimator): tags = _safe_tags(estimator) pairwise = _is_pairwise(estimator) - yield check_no_attributes_set_in_init yield check_estimators_dtypes - yield check_fit_score_takes_y if has_fit_parameter(estimator, "sample_weight"): yield check_sample_weights_pandas_series yield check_sample_weights_not_an_array @@ -274,6 +282,9 @@ def _yield_all_checks(estimator): ) return + for check in _yield_api_estimator_checks(estimator): + yield check + for check in _yield_checks(estimator): yield check if is_classifier(estimator): @@ -685,34 +696,6 @@ def __array_function__(self, func, types, args, kwargs): raise TypeError("Don't want to call array_function {}!".format(func.__name__)) -def _is_pairwise_metric(estimator): - """Returns True if estimator accepts pairwise metric. - - Parameters - ---------- - estimator : object - Estimator object to test. - - Returns - ------- - out : bool - True if _pairwise is set to True and False otherwise. - """ - metric = getattr(estimator, "metric", None) - - return bool(metric == "precomputed") - - -def _pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel): - - if _is_pairwise_metric(estimator): - return pairwise_distances(X, metric="euclidean") - if _is_pairwise(estimator): - return kernel(X, X) - - return X - - def _generate_sparse_matrix(X_csr): """Generate sparse matrices with {32,64}bit indices of diverse format. @@ -1580,36 +1563,6 @@ def check_pipeline_consistency(name, estimator_orig): assert_allclose_dense_sparse(result, result_pipe) -@ignore_warnings -def check_fit_score_takes_y(name, estimator_orig): - # check that all estimators accept an optional y - # in fit and score so they can be used in pipelines - rnd = np.random.RandomState(0) - n_samples = 30 - X = rnd.uniform(size=(n_samples, 3)) - X = _pairwise_estimator_convert_X(X, estimator_orig) - y = np.arange(n_samples) % 3 - estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - set_random_state(estimator) - - funcs = ["fit", "score", "partial_fit", "fit_predict", "fit_transform"] - for func_name in funcs: - func = getattr(estimator, func_name, None) - if func is not None: - func(X, y) - args = [p.name for p in signature(func).parameters.values()] - if args[0] == "self": - # if_delegate_has_method makes methods into functions - # with an explicit "self", so need to shift arguments - args = args[1:] - assert args[1] in ["y", "Y"], ( - "Expected y or Y as second argument for method " - "%s of %s. Got arguments: %r." - % (func_name, type(estimator).__name__, args) - ) - - @ignore_warnings def check_estimators_dtypes(name, estimator_orig): rnd = np.random.RandomState(0) @@ -2723,42 +2676,6 @@ def check_estimators_overwrite_params(name, estimator_orig): ) -@ignore_warnings(category=FutureWarning) -def check_no_attributes_set_in_init(name, estimator_orig): - """Check setting during init.""" - try: - # Clone fails if the estimator does not store - # all parameters as an attribute during init - estimator = clone(estimator_orig) - except AttributeError: - raise AttributeError( - f"Estimator {name} should store all parameters as an attribute during init." - ) - - if hasattr(type(estimator).__init__, "deprecated_original"): - return - - init_params = _get_args(type(estimator).__init__) - if IS_PYPY: - # __init__ signature has additional objects in PyPy - for key in ["obj"]: - if key in init_params: - init_params.remove(key) - parents_init_params = [ - param - for params_parent in (_get_args(parent) for parent in type(estimator).__mro__) - for param in params_parent - ] - - # Test for no setting apart from parameters during init - invalid_attr = set(vars(estimator)) - set(init_params) - set(parents_init_params) - assert not invalid_attr, ( - "Estimator %s should not set any attribute apart" - " from parameters during init. Found attributes %s." - % (name, sorted(invalid_attr)) - ) - - @ignore_warnings(category=FutureWarning) def check_sparsify_coefficients(name, estimator_orig): X = np.array( @@ -2973,39 +2890,6 @@ def param_filter(p): assert param_value == init_param.default, failure_text -def _enforce_estimator_tags_y(estimator, y): - # Estimators with a `requires_positive_y` tag only accept strictly positive - # data - if _safe_tags(estimator, key="requires_positive_y"): - # Create strictly positive y. The minimal increment above 0 is 1, as - # y could be of integer dtype. - y += 1 + abs(y.min()) - # Estimators with a `binary_only` tag only accept up to two unique y values - if _safe_tags(estimator, key="binary_only") and y.size > 0: - y = np.where(y == y.flat[0], y, y.flat[0] + 1) - # Estimators in mono_output_task_error raise ValueError if y is of 1-D - # Convert into a 2-D y for those estimators. - if _safe_tags(estimator, key="multioutput_only"): - return np.reshape(y, (-1, 1)) - return y - - -def _enforce_estimator_tags_x(estimator, X): - # Pairwise estimators only accept - # X of shape (`n_samples`, `n_samples`) - if _is_pairwise(estimator): - X = X.dot(X.T) - # Estimators with `1darray` in `X_types` tag only accept - # X of shape (`n_samples`,) - if "1darray" in _safe_tags(estimator, key="X_types"): - X = X[:, 0] - # Estimators with a `requires_positive_X` tag only accept - # strictly positive data - if _safe_tags(estimator, key="requires_positive_X"): - X -= X.min() - return X - - @ignore_warnings(category=FutureWarning) def check_non_transformer_estimators_n_iter(name, estimator_orig): # Test that estimators that are not transformers with a parameter