From 3338c71576d4fb58b8e38c1f85da0b68623888ed Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 9 Sep 2024 23:19:37 +0200 Subject: [PATCH 1/6] TST allow testing on multiple instances of estimators --- doc/sphinxext/allow_nan_estimators.py | 7 +- sklearn/tests/test_common.py | 31 +- sklearn/tests/test_docstring_parameters.py | 6 +- .../utils/_test_common/instance_generator.py | 271 ++++++++++++------ sklearn/utils/estimator_checks.py | 22 +- 5 files changed, 209 insertions(+), 128 deletions(-) diff --git a/doc/sphinxext/allow_nan_estimators.py b/doc/sphinxext/allow_nan_estimators.py index a1c77d81d1460..d2eb0e940b6a1 100755 --- a/doc/sphinxext/allow_nan_estimators.py +++ b/doc/sphinxext/allow_nan_estimators.py @@ -4,7 +4,7 @@ from docutils.parsers.rst import Directive from sklearn.utils import all_estimators -from sklearn.utils._test_common.instance_generator import _construct_instance +from sklearn.utils._test_common.instance_generator import _construct_instances from sklearn.utils._testing import SkipTest @@ -19,7 +19,10 @@ def make_paragraph_for_estimator_type(estimator_type): lst = nodes.bullet_list() for name, est_class in all_estimators(type_filter=estimator_type): with suppress(SkipTest): - est = _construct_instance(est_class) + # Here we generate the text only for one instance. This directive + # should not be used for meta-estimators where tags depend on the + # sub-estimator. + est = next(_construct_instances(est_class)) if est.__sklearn_tags__().input_tags.allow_nan: module_name = ".".join(est_class.__module__.split(".")[:2]) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index aefc7b03e1615..e66fd96921640 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -28,7 +28,7 @@ ) from sklearn.compose import ColumnTransformer from sklearn.datasets import make_blobs -from sklearn.exceptions import ConvergenceWarning, FitFailedWarning +from sklearn.exceptions import ConvergenceWarning # make it possible to discover experimental estimators when calling `all_estimators` from sklearn.experimental import ( @@ -55,8 +55,6 @@ from sklearn.utils import all_estimators from sklearn.utils._tags import get_tags from sklearn.utils._test_common.instance_generator import ( - _generate_pipeline, - _generate_search_cv_instances, _get_check_estimator_ids, _tested_estimators, ) @@ -132,7 +130,7 @@ def test_get_check_estimator_ids(val, expected): assert _get_check_estimator_ids(val) == expected -@parametrize_with_checks(list(chain(_tested_estimators(), _generate_pipeline()))) +@parametrize_with_checks(list(_tested_estimators())) def test_estimators(estimator, check, request): # Common tests for estimator instances with ignore_warnings( @@ -233,22 +231,6 @@ def test_class_support_removed(): parametrize_with_checks([LogisticRegression]) -@parametrize_with_checks(list(_generate_search_cv_instances())) -def test_search_cv(estimator, check, request): - # Common tests for SearchCV instances - # We have a separate test because those meta-estimators can accept a - # wide range of base estimators (classifiers, regressors, pipelines) - with ignore_warnings( - category=( - FutureWarning, - ConvergenceWarning, - UserWarning, - FitFailedWarning, - ) - ): - check(estimator) - - @pytest.mark.parametrize( "estimator", _tested_estimators(), ids=_get_check_estimator_ids ) @@ -311,7 +293,6 @@ def _estimators_that_predict_in_fit(): chain( _tested_estimators(), [make_pipeline(LogisticRegression(C=1))], - list(_generate_search_cv_instances()), _estimators_that_predict_in_fit(), ) ) @@ -401,13 +382,7 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator): @pytest.mark.parametrize( - "estimator", - chain( - _tested_estimators(), - _generate_pipeline(), - _generate_search_cv_instances(), - ), - ids=_get_check_estimator_ids, + "estimator", list(_tested_estimators()), ids=_get_check_estimator_ids ) def test_check_param_validation(estimator): if isinstance(estimator, FeatureUnion): diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 3211f918aea36..34b698e3f2073 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -23,7 +23,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import FunctionTransformer from sklearn.utils import all_estimators -from sklearn.utils._test_common.instance_generator import _construct_instance +from sklearn.utils._test_common.instance_generator import _construct_instances from sklearn.utils._testing import ( _get_func_name, assert_docstring_consistency, @@ -202,7 +202,9 @@ def test_fit_docstring_attributes(name, Estimator): elif Estimator.__name__ == "SparseCoder": est = _construct_sparse_coder(Estimator) else: - est = _construct_instance(Estimator) + # TODO(devtools): use _tested_estimators instead of all_estimators in the + # decorator + est = next(_construct_instances(Estimator)) if Estimator.__name__ == "SelectKBest": est.set_params(k=2) diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index aff5d58a8f3a7..9ad5b85b82978 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -5,10 +5,10 @@ import re import warnings from functools import partial -from inspect import isfunction, signature -from itertools import product +from inspect import isfunction from sklearn import config_context +from sklearn.base import clone from sklearn.calibration import CalibratedClassifierCV from sklearn.cluster import ( HDBSCAN, @@ -129,7 +129,7 @@ ) from sklearn.neighbors import NeighborhoodComponentsAnalysis from sklearn.neural_network import BernoulliRBM, MLPClassifier, MLPRegressor -from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline +from sklearn.pipeline import FeatureUnion, Pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder from sklearn.random_projection import ( GaussianRandomProjection, @@ -143,7 +143,7 @@ from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import all_estimators -from sklearn.utils._testing import SkipTest, set_random_state +from sklearn.utils._testing import SkipTest CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"] @@ -190,25 +190,102 @@ GradientBoostingRegressor: dict(n_estimators=5), GraphicalLassoCV: dict(max_iter=5, cv=3), GraphicalLasso: dict(max_iter=5), - GridSearchCV: dict( - estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}, cv=3 - ), - HalvingGridSearchCV: dict( - estimator=Ridge(), - min_resources="smallest", - param_grid={"alpha": [0.1, 1.0]}, - random_state=0, - cv=2, - error_score="raise", - ), - HalvingRandomSearchCV: dict( - estimator=Ridge(), - param_distributions={"alpha": [0.1, 1.0]}, - min_resources="smallest", - cv=2, - error_score="raise", - random_state=0, - ), + GridSearchCV: [ + dict( + cv=2, + error_score="raise", + estimator=Ridge(), + param_grid={"alpha": [0.1, 1.0]}, + ), + dict( + cv=2, + error_score="raise", + estimator=LogisticRegression(), + param_grid={"C": [0.1, 1.0]}, + ), + dict( + cv=2, + error_score="raise", + estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]), + param_grid={"ridge__alpha": [0.1, 1.0]}, + ), + dict( + cv=2, + error_score="raise", + estimator=Pipeline( + steps=[("pca", PCA()), ("logisticregression", LogisticRegression())] + ), + param_grid={"logisticregression__C": [0.1, 1.0]}, + ), + ], + HalvingGridSearchCV: [ + dict( + cv=2, + error_score="raise", + estimator=Ridge(), + min_resources="smallest", + param_grid={"alpha": [0.1, 1.0]}, + random_state=0, + ), + dict( + cv=2, + error_score="raise", + estimator=LogisticRegression(), + min_resources="smallest", + param_grid={"C": [0.1, 1.0]}, + random_state=0, + ), + dict( + cv=2, + error_score="raise", + estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]), + min_resources="smallest", + param_grid={"ridge__alpha": [0.1, 1.0]}, + random_state=0, + ), + dict( + cv=2, + error_score="raise", + estimator=Pipeline( + steps=[("pca", PCA()), ("logisticregression", LogisticRegression())] + ), + min_resources="smallest", + param_grid={"logisticregression__C": [0.1, 1.0]}, + random_state=0, + ), + ], + HalvingRandomSearchCV: [ + dict( + cv=2, + error_score="raise", + estimator=Ridge(), + param_distributions={"alpha": [0.1, 1.0]}, + random_state=0, + ), + dict( + cv=2, + error_score="raise", + estimator=LogisticRegression(), + param_distributions={"C": [0.1, 1.0]}, + random_state=0, + ), + dict( + cv=2, + error_score="raise", + estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]), + param_distributions={"ridge__alpha": [0.1, 1.0]}, + random_state=0, + ), + dict( + cv=2, + error_score="raise", + estimator=Pipeline( + steps=[("pca", PCA()), ("logisticregression", LogisticRegression())] + ), + param_distributions={"logisticregression__C": [0.1, 1.0]}, + random_state=0, + ), + ], HDBSCAN: dict(min_samples=1), # The default min_samples_leaf (20) isn't appropriate for small # datasets (only very shallow trees are built) that the checks use. @@ -264,19 +341,53 @@ PassiveAggressiveClassifier: dict(max_iter=5), PassiveAggressiveRegressor: dict(max_iter=5), Perceptron: dict(max_iter=5), - Pipeline: dict(steps=[("scaler", StandardScaler()), ("est", Ridge())]), + Pipeline: [ + {"steps": [("scaler", StandardScaler()), ("final_estimator", Ridge())]}, + { + "steps": [ + ("scaler", StandardScaler()), + ("final_estimator", LogisticRegression()), + ] + }, + ], PLSCanonical: dict(n_components=1, max_iter=5), PLSRegression: dict(n_components=1, max_iter=5), PLSSVD: dict(n_components=1), PoissonRegressor: dict(max_iter=5), RandomForestClassifier: dict(n_estimators=5), RandomForestRegressor: dict(n_estimators=5), - RandomizedSearchCV: dict( - estimator=LogisticRegression(C=1), - param_distributions={"C": [1.0]}, - n_iter=5, - cv=3, - ), + RandomizedSearchCV: [ + dict( + cv=2, + error_score="raise", + estimator=Ridge(), + param_distributions={"alpha": [0.1, 1.0]}, + random_state=0, + ), + dict( + cv=2, + error_score="raise", + estimator=LogisticRegression(), + param_distributions={"C": [0.1, 1.0]}, + random_state=0, + ), + dict( + cv=2, + error_score="raise", + estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]), + param_distributions={"ridge__alpha": [0.1, 1.0]}, + random_state=0, + ), + dict( + cv=2, + error_score="raise", + estimator=Pipeline( + steps=[("pca", PCA()), ("logisticregression", LogisticRegression())] + ), + param_distributions={"logisticregression__C": [0.1, 1.0]}, + random_state=0, + ), + ], RandomTreesEmbedding: dict(n_estimators=5), # `RANSACRegressor` will raise an error with any model other # than `LinearRegression` if we don't fix the `min_samples` parameter. @@ -348,33 +459,31 @@ } +# This dictionary stores parameters for specific checks. It also enables running the +# same check with multiple instances of the same estimator with different parameters. +# The special key "*" allows to apply the parameters to all checks. +CHECK_PARAMS = { + Pipeline: {"*": []}, + GridSearchCV: {"*": []}, + HalvingGridSearchCV: {"*": []}, + RandomizedSearchCV: {"*": []}, + HalvingRandomSearchCV: {"*": []}, +} + + def _tested_estimators(type_filter=None): for name, Estimator in all_estimators(type_filter=type_filter): try: - estimator = _construct_instance(Estimator) + for estimator in _construct_instances(Estimator): + yield estimator except SkipTest: continue - yield estimator - - -def _generate_pipeline(): - """Generator of simple pipeline to check compliance of the - :class:`~sklearn.pipeline.Pipeline` class. - """ - for final_estimator in [Ridge(), LogisticRegression()]: - yield Pipeline( - steps=[ - ("scaler", StandardScaler()), - ("final_estimator", final_estimator), - ] - ) - SKIPPED_ESTIMATORS = [SparseCoder] -def _construct_instance(Estimator): +def _construct_instances(Estimator): """Construct Estimator instance if possible.""" if Estimator in SKIPPED_ESTIMATORS: msg = f"Can't instantiate estimator {Estimator.__name__}" @@ -425,48 +534,28 @@ def _get_check_estimator_ids(obj): return re.sub(r"\s", "", str(obj)) -def _generate_search_cv_instances(): - """Generator of `SearchCV` instances to check their compliance with scikit-learn.""" - for SearchCV, (Estimator, param_grid) in product( - [ - GridSearchCV, - HalvingGridSearchCV, - RandomizedSearchCV, - HalvingGridSearchCV, - ], - [ - (Ridge, {"alpha": [0.1, 1.0]}), - (LogisticRegression, {"C": [0.1, 1.0]}), - ], - ): - init_params = signature(SearchCV).parameters - extra_params = ( - {"min_resources": "smallest"} if "min_resources" in init_params else {} - ) - search_cv = SearchCV( - Estimator(), param_grid, cv=2, error_score="raise", **extra_params - ) - set_random_state(search_cv) - yield search_cv +def _yield_instances_for_check(check, estimator_orig): + """Yield instances for a check. - for SearchCV, (Estimator, param_grid) in product( - [ - GridSearchCV, - HalvingGridSearchCV, - RandomizedSearchCV, - HalvingRandomSearchCV, - ], - [ - (Ridge, {"ridge__alpha": [0.1, 1.0]}), - (LogisticRegression, {"logisticregression__C": [0.1, 1.0]}), - ], - ): - init_params = signature(SearchCV).parameters - extra_params = ( - {"min_resources": "smallest"} if "min_resources" in init_params else {} - ) - search_cv = SearchCV( - make_pipeline(PCA(), Estimator()), param_grid, cv=2, **extra_params - ).set_params(error_score="raise") - set_random_state(search_cv) - yield search_cv + For most estimators, this is a no-op. + + For estimators which have an entry in CHECK_PARAMS, this will yield + an estimator for each parameter set in CHECK_PARAMS[estimator]. + """ + if type(estimator_orig) not in CHECK_PARAMS: + return (estimator_orig,) + + check_params = CHECK_PARAMS[type(estimator_orig)] + + if "*" not in check_params and check not in check_params: + return (estimator_orig,) + + if "*" in check_params: + param_set = check_params["*"] + else: + param_set = check_params[check] + + for params in param_set: + estimator = clone(estimator_orig) + estimator.set_params(**params) + yield estimator diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6da7c8eb1c7ff..30b961228e80f 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -59,8 +59,9 @@ from ._test_common.instance_generator import ( CROSS_DECOMPOSITION, INIT_PARAMS, - _construct_instance, + _construct_instances, _get_check_estimator_ids, + _yield_instances_for_check, ) from ._testing import ( SkipTest, @@ -508,10 +509,14 @@ def parametrize_with_checks(estimators, *, legacy=True): def checks_generator(): for estimator in estimators: + # First check that the estimator is cloneable which is needed for the rest + # of the checks to run name = type(estimator).__name__ + yield estimator, partial(check_estimator_cloneable, name) for check in _yield_all_checks(estimator, legacy=legacy): - check = partial(check, name) - yield _maybe_mark_xfail(estimator, check, pytest) + check_with_name = partial(check, name) + for check_instance in _yield_instances_for_check(check, estimator): + yield _maybe_mark_xfail(check_instance, check_with_name, pytest) return pytest.mark.parametrize( "estimator, check", checks_generator(), ids=_get_check_estimator_ids @@ -596,9 +601,13 @@ def check_estimator(estimator=None, generate_only=False, *, legacy=True): name = type(estimator).__name__ def checks_generator(): + # we first need to check if the estimator is cloneable for the rest of the tests + # to run + yield estimator, partial(check_estimator_cloneable, name) for check in _yield_all_checks(estimator, legacy=legacy): check = _maybe_skip(estimator, check) - yield estimator, partial(check, name) + for check_instance in _yield_instances_for_check(check, estimator): + yield check_instance, partial(check, name) if generate_only: return checks_generator() @@ -3303,7 +3312,10 @@ def check_parameters_default_constructible(name, estimator_orig): Estimator = estimator_orig.__class__ with ignore_warnings(category=FutureWarning): - estimator = _construct_instance(Estimator) + # TODO(devtools): this test is flawed because _construct_instances doesn't + # construct with default parameters for sklearn estimators, while it does + # for third party estimators. + estimator = next(_construct_instances(Estimator)) # test that set_params returns self assert estimator.set_params() is estimator From c1509d8f063b189d30db777597a0907fb5d5c9f7 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 10 Sep 2024 13:37:34 +0200 Subject: [PATCH 2/6] cleanup --- doc/sphinxext/allow_nan_estimators.py | 7 +- sklearn/tests/test_common.py | 31 ++- sklearn/tests/test_docstring_parameters.py | 6 +- .../utils/_test_common/instance_generator.py | 249 +++++++----------- sklearn/utils/estimator_checks.py | 4 +- 5 files changed, 130 insertions(+), 167 deletions(-) diff --git a/doc/sphinxext/allow_nan_estimators.py b/doc/sphinxext/allow_nan_estimators.py index d2eb0e940b6a1..a1c77d81d1460 100755 --- a/doc/sphinxext/allow_nan_estimators.py +++ b/doc/sphinxext/allow_nan_estimators.py @@ -4,7 +4,7 @@ from docutils.parsers.rst import Directive from sklearn.utils import all_estimators -from sklearn.utils._test_common.instance_generator import _construct_instances +from sklearn.utils._test_common.instance_generator import _construct_instance from sklearn.utils._testing import SkipTest @@ -19,10 +19,7 @@ def make_paragraph_for_estimator_type(estimator_type): lst = nodes.bullet_list() for name, est_class in all_estimators(type_filter=estimator_type): with suppress(SkipTest): - # Here we generate the text only for one instance. This directive - # should not be used for meta-estimators where tags depend on the - # sub-estimator. - est = next(_construct_instances(est_class)) + est = _construct_instance(est_class) if est.__sklearn_tags__().input_tags.allow_nan: module_name = ".".join(est_class.__module__.split(".")[:2]) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index e66fd96921640..aefc7b03e1615 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -28,7 +28,7 @@ ) from sklearn.compose import ColumnTransformer from sklearn.datasets import make_blobs -from sklearn.exceptions import ConvergenceWarning +from sklearn.exceptions import ConvergenceWarning, FitFailedWarning # make it possible to discover experimental estimators when calling `all_estimators` from sklearn.experimental import ( @@ -55,6 +55,8 @@ from sklearn.utils import all_estimators from sklearn.utils._tags import get_tags from sklearn.utils._test_common.instance_generator import ( + _generate_pipeline, + _generate_search_cv_instances, _get_check_estimator_ids, _tested_estimators, ) @@ -130,7 +132,7 @@ def test_get_check_estimator_ids(val, expected): assert _get_check_estimator_ids(val) == expected -@parametrize_with_checks(list(_tested_estimators())) +@parametrize_with_checks(list(chain(_tested_estimators(), _generate_pipeline()))) def test_estimators(estimator, check, request): # Common tests for estimator instances with ignore_warnings( @@ -231,6 +233,22 @@ def test_class_support_removed(): parametrize_with_checks([LogisticRegression]) +@parametrize_with_checks(list(_generate_search_cv_instances())) +def test_search_cv(estimator, check, request): + # Common tests for SearchCV instances + # We have a separate test because those meta-estimators can accept a + # wide range of base estimators (classifiers, regressors, pipelines) + with ignore_warnings( + category=( + FutureWarning, + ConvergenceWarning, + UserWarning, + FitFailedWarning, + ) + ): + check(estimator) + + @pytest.mark.parametrize( "estimator", _tested_estimators(), ids=_get_check_estimator_ids ) @@ -293,6 +311,7 @@ def _estimators_that_predict_in_fit(): chain( _tested_estimators(), [make_pipeline(LogisticRegression(C=1))], + list(_generate_search_cv_instances()), _estimators_that_predict_in_fit(), ) ) @@ -382,7 +401,13 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator): @pytest.mark.parametrize( - "estimator", list(_tested_estimators()), ids=_get_check_estimator_ids + "estimator", + chain( + _tested_estimators(), + _generate_pipeline(), + _generate_search_cv_instances(), + ), + ids=_get_check_estimator_ids, ) def test_check_param_validation(estimator): if isinstance(estimator, FeatureUnion): diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 34b698e3f2073..3211f918aea36 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -23,7 +23,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import FunctionTransformer from sklearn.utils import all_estimators -from sklearn.utils._test_common.instance_generator import _construct_instances +from sklearn.utils._test_common.instance_generator import _construct_instance from sklearn.utils._testing import ( _get_func_name, assert_docstring_consistency, @@ -202,9 +202,7 @@ def test_fit_docstring_attributes(name, Estimator): elif Estimator.__name__ == "SparseCoder": est = _construct_sparse_coder(Estimator) else: - # TODO(devtools): use _tested_estimators instead of all_estimators in the - # decorator - est = next(_construct_instances(Estimator)) + est = _construct_instance(Estimator) if Estimator.__name__ == "SelectKBest": est.set_params(k=2) diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 9ad5b85b82978..5235f8dd4dad2 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -5,10 +5,10 @@ import re import warnings from functools import partial -from inspect import isfunction +from inspect import isfunction, signature +from itertools import product -from sklearn import config_context -from sklearn.base import clone +from sklearn import clone, config_context from sklearn.calibration import CalibratedClassifierCV from sklearn.cluster import ( HDBSCAN, @@ -129,7 +129,7 @@ ) from sklearn.neighbors import NeighborhoodComponentsAnalysis from sklearn.neural_network import BernoulliRBM, MLPClassifier, MLPRegressor -from sklearn.pipeline import FeatureUnion, Pipeline +from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder from sklearn.random_projection import ( GaussianRandomProjection, @@ -143,7 +143,7 @@ from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import all_estimators -from sklearn.utils._testing import SkipTest +from sklearn.utils._testing import SkipTest, set_random_state CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"] @@ -190,102 +190,25 @@ GradientBoostingRegressor: dict(n_estimators=5), GraphicalLassoCV: dict(max_iter=5, cv=3), GraphicalLasso: dict(max_iter=5), - GridSearchCV: [ - dict( - cv=2, - error_score="raise", - estimator=Ridge(), - param_grid={"alpha": [0.1, 1.0]}, - ), - dict( - cv=2, - error_score="raise", - estimator=LogisticRegression(), - param_grid={"C": [0.1, 1.0]}, - ), - dict( - cv=2, - error_score="raise", - estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]), - param_grid={"ridge__alpha": [0.1, 1.0]}, - ), - dict( - cv=2, - error_score="raise", - estimator=Pipeline( - steps=[("pca", PCA()), ("logisticregression", LogisticRegression())] - ), - param_grid={"logisticregression__C": [0.1, 1.0]}, - ), - ], - HalvingGridSearchCV: [ - dict( - cv=2, - error_score="raise", - estimator=Ridge(), - min_resources="smallest", - param_grid={"alpha": [0.1, 1.0]}, - random_state=0, - ), - dict( - cv=2, - error_score="raise", - estimator=LogisticRegression(), - min_resources="smallest", - param_grid={"C": [0.1, 1.0]}, - random_state=0, - ), - dict( - cv=2, - error_score="raise", - estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]), - min_resources="smallest", - param_grid={"ridge__alpha": [0.1, 1.0]}, - random_state=0, - ), - dict( - cv=2, - error_score="raise", - estimator=Pipeline( - steps=[("pca", PCA()), ("logisticregression", LogisticRegression())] - ), - min_resources="smallest", - param_grid={"logisticregression__C": [0.1, 1.0]}, - random_state=0, - ), - ], - HalvingRandomSearchCV: [ - dict( - cv=2, - error_score="raise", - estimator=Ridge(), - param_distributions={"alpha": [0.1, 1.0]}, - random_state=0, - ), - dict( - cv=2, - error_score="raise", - estimator=LogisticRegression(), - param_distributions={"C": [0.1, 1.0]}, - random_state=0, - ), - dict( - cv=2, - error_score="raise", - estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]), - param_distributions={"ridge__alpha": [0.1, 1.0]}, - random_state=0, - ), - dict( - cv=2, - error_score="raise", - estimator=Pipeline( - steps=[("pca", PCA()), ("logisticregression", LogisticRegression())] - ), - param_distributions={"logisticregression__C": [0.1, 1.0]}, - random_state=0, - ), - ], + GridSearchCV: dict( + estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}, cv=3 + ), + HalvingGridSearchCV: dict( + estimator=Ridge(), + min_resources="smallest", + param_grid={"alpha": [0.1, 1.0]}, + random_state=0, + cv=2, + error_score="raise", + ), + HalvingRandomSearchCV: dict( + estimator=Ridge(), + param_distributions={"alpha": [0.1, 1.0]}, + min_resources="smallest", + cv=2, + error_score="raise", + random_state=0, + ), HDBSCAN: dict(min_samples=1), # The default min_samples_leaf (20) isn't appropriate for small # datasets (only very shallow trees are built) that the checks use. @@ -341,53 +264,19 @@ PassiveAggressiveClassifier: dict(max_iter=5), PassiveAggressiveRegressor: dict(max_iter=5), Perceptron: dict(max_iter=5), - Pipeline: [ - {"steps": [("scaler", StandardScaler()), ("final_estimator", Ridge())]}, - { - "steps": [ - ("scaler", StandardScaler()), - ("final_estimator", LogisticRegression()), - ] - }, - ], + Pipeline: dict(steps=[("scaler", StandardScaler()), ("est", Ridge())]), PLSCanonical: dict(n_components=1, max_iter=5), PLSRegression: dict(n_components=1, max_iter=5), PLSSVD: dict(n_components=1), PoissonRegressor: dict(max_iter=5), RandomForestClassifier: dict(n_estimators=5), RandomForestRegressor: dict(n_estimators=5), - RandomizedSearchCV: [ - dict( - cv=2, - error_score="raise", - estimator=Ridge(), - param_distributions={"alpha": [0.1, 1.0]}, - random_state=0, - ), - dict( - cv=2, - error_score="raise", - estimator=LogisticRegression(), - param_distributions={"C": [0.1, 1.0]}, - random_state=0, - ), - dict( - cv=2, - error_score="raise", - estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]), - param_distributions={"ridge__alpha": [0.1, 1.0]}, - random_state=0, - ), - dict( - cv=2, - error_score="raise", - estimator=Pipeline( - steps=[("pca", PCA()), ("logisticregression", LogisticRegression())] - ), - param_distributions={"logisticregression__C": [0.1, 1.0]}, - random_state=0, - ), - ], + RandomizedSearchCV: dict( + estimator=LogisticRegression(C=1), + param_distributions={"C": [1.0]}, + n_iter=5, + cv=3, + ), RandomTreesEmbedding: dict(n_estimators=5), # `RANSACRegressor` will raise an error with any model other # than `LinearRegression` if we don't fix the `min_samples` parameter. @@ -458,32 +347,39 @@ ), } - # This dictionary stores parameters for specific checks. It also enables running the # same check with multiple instances of the same estimator with different parameters. # The special key "*" allows to apply the parameters to all checks. -CHECK_PARAMS = { - Pipeline: {"*": []}, - GridSearchCV: {"*": []}, - HalvingGridSearchCV: {"*": []}, - RandomizedSearchCV: {"*": []}, - HalvingRandomSearchCV: {"*": []}, -} +CHECK_PARAMS: dict = {} def _tested_estimators(type_filter=None): for name, Estimator in all_estimators(type_filter=type_filter): try: - for estimator in _construct_instances(Estimator): - yield estimator + estimator = _construct_instance(Estimator) except SkipTest: continue + yield estimator + + +def _generate_pipeline(): + """Generator of simple pipeline to check compliance of the + :class:`~sklearn.pipeline.Pipeline` class. + """ + for final_estimator in [Ridge(), LogisticRegression()]: + yield Pipeline( + steps=[ + ("scaler", StandardScaler()), + ("final_estimator", final_estimator), + ] + ) + SKIPPED_ESTIMATORS = [SparseCoder] -def _construct_instances(Estimator): +def _construct_instance(Estimator): """Construct Estimator instance if possible.""" if Estimator in SKIPPED_ESTIMATORS: msg = f"Can't instantiate estimator {Estimator.__name__}" @@ -534,6 +430,53 @@ def _get_check_estimator_ids(obj): return re.sub(r"\s", "", str(obj)) +def _generate_search_cv_instances(): + """Generator of `SearchCV` instances to check their compliance with scikit-learn.""" + for SearchCV, (Estimator, param_grid) in product( + [ + GridSearchCV, + HalvingGridSearchCV, + RandomizedSearchCV, + HalvingGridSearchCV, + ], + [ + (Ridge, {"alpha": [0.1, 1.0]}), + (LogisticRegression, {"C": [0.1, 1.0]}), + ], + ): + init_params = signature(SearchCV).parameters + extra_params = ( + {"min_resources": "smallest"} if "min_resources" in init_params else {} + ) + search_cv = SearchCV( + Estimator(), param_grid, cv=2, error_score="raise", **extra_params + ) + set_random_state(search_cv) + yield search_cv + + for SearchCV, (Estimator, param_grid) in product( + [ + GridSearchCV, + HalvingGridSearchCV, + RandomizedSearchCV, + HalvingRandomSearchCV, + ], + [ + (Ridge, {"ridge__alpha": [0.1, 1.0]}), + (LogisticRegression, {"logisticregression__C": [0.1, 1.0]}), + ], + ): + init_params = signature(SearchCV).parameters + extra_params = ( + {"min_resources": "smallest"} if "min_resources" in init_params else {} + ) + search_cv = SearchCV( + make_pipeline(PCA(), Estimator()), param_grid, cv=2, **extra_params + ).set_params(error_score="raise") + set_random_state(search_cv) + yield search_cv + + def _yield_instances_for_check(check, estimator_orig): """Yield instances for a check. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 30b961228e80f..3fda4b98af2e4 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -59,7 +59,7 @@ from ._test_common.instance_generator import ( CROSS_DECOMPOSITION, INIT_PARAMS, - _construct_instances, + _construct_instance, _get_check_estimator_ids, _yield_instances_for_check, ) @@ -3315,7 +3315,7 @@ def check_parameters_default_constructible(name, estimator_orig): # TODO(devtools): this test is flawed because _construct_instances doesn't # construct with default parameters for sklearn estimators, while it does # for third party estimators. - estimator = next(_construct_instances(Estimator)) + estimator = _construct_instance(Estimator) # test that set_params returns self assert estimator.set_params() is estimator From dffd6acd808f76745b48ab832ac899d3c59836dc Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 10 Sep 2024 15:14:29 +0200 Subject: [PATCH 3/6] cleanup and fix test --- sklearn/cluster/_bicluster.py | 9 ++ .../utils/_test_common/instance_generator.py | 106 ++++++++++++++++-- sklearn/utils/estimator_checks.py | 22 +--- 3 files changed, 107 insertions(+), 30 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index e86ca4e7268fe..c42f2a133fa24 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -362,6 +362,15 @@ def _fit(self, X): [self.column_labels_ == c for c in range(self.n_clusters)] ) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags._xfail_checks = { + # ValueError: Found array with 0 feature(s) (shape=(23, 0)) + # while a minimum of 1 is required. + "check_dict_unchanged": "FIXME", + } + return tags + class SpectralBiclustering(BaseSpectral): """Spectral biclustering (Kluger, 2003). diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 5235f8dd4dad2..c410dcc933c15 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -34,6 +34,7 @@ FactorAnalysis, FastICA, IncrementalPCA, + KernelPCA, LatentDirichletAllocation, MiniBatchDictionaryLearning, MiniBatchNMF, @@ -42,6 +43,7 @@ SparsePCA, TruncatedSVD, ) +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.dummy import DummyClassifier from sklearn.ensemble import ( AdaBoostClassifier, @@ -73,6 +75,12 @@ SelectKBest, SequentialFeatureSelector, ) +from sklearn.kernel_approximation import ( + Nystroem, + PolynomialCountSketch, + RBFSampler, + SkewedChi2Sampler, +) from sklearn.linear_model import ( ARDRegression, BayesianRidge, @@ -106,7 +114,13 @@ TheilSenRegressor, TweedieRegressor, ) -from sklearn.manifold import MDS, TSNE, LocallyLinearEmbedding, SpectralEmbedding +from sklearn.manifold import ( + MDS, + TSNE, + Isomap, + LocallyLinearEmbedding, + SpectralEmbedding, +) from sklearn.mixture import BayesianGaussianMixture, GaussianMixture from sklearn.model_selection import ( FixedThresholdClassifier, @@ -350,7 +364,74 @@ # This dictionary stores parameters for specific checks. It also enables running the # same check with multiple instances of the same estimator with different parameters. # The special key "*" allows to apply the parameters to all checks. -CHECK_PARAMS: dict = {} +CHECK_PARAMS: dict = { + # TODO(devtools): check that function names here exist in checks for the estimator + # TODO(devtools): write a test for the same thing with tags._xfail_checks + AgglomerativeClustering: {"check_dict_unchanged": dict(n_clusters=1)}, + BayesianGaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)}, + BernoulliRBM: {"check_dict_unchanged": dict(n_components=1, n_iter=5)}, + Birch: {"check_dict_unchanged": dict(n_clusters=1)}, + BisectingKMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)}, + CCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, + DictionaryLearning: { + "check_dict_unchanged": dict( + max_iter=20, n_components=1, transform_algorithm="lasso_lars" + ) + }, + FactorAnalysis: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, + FastICA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, + FeatureAgglomeration: {"check_dict_unchanged": dict(n_clusters=1)}, + GaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)}, + GaussianRandomProjection: {"check_dict_unchanged": dict(n_components=1)}, + IncrementalPCA: {"check_dict_unchanged": dict(batch_size=10, n_components=1)}, + Isomap: {"check_dict_unchanged": dict(n_components=1)}, + KMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)}, + KernelPCA: {"check_dict_unchanged": dict(n_components=1)}, + LatentDirichletAllocation: { + "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1) + }, + LinearDiscriminantAnalysis: {"check_dict_unchanged": dict(n_components=1)}, + LocallyLinearEmbedding: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, + MDS: {"check_dict_unchanged": dict(max_iter=5, n_components=1, n_init=2)}, + MiniBatchDictionaryLearning: { + "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1) + }, + MiniBatchKMeans: { + "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_clusters=1, n_init=2) + }, + MiniBatchNMF: { + "check_dict_unchanged": dict( + batch_size=10, fresh_restarts=True, max_iter=20, n_components=1 + ) + }, + MiniBatchSparsePCA: { + "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1) + }, + NMF: {"check_dict_unchanged": dict(max_iter=500, n_components=1)}, + NeighborhoodComponentsAnalysis: { + "check_dict_unchanged": dict(max_iter=5, n_components=1) + }, + Nystroem: {"check_dict_unchanged": dict(n_components=1)}, + PCA: {"check_dict_unchanged": dict(n_components=1)}, + PLSCanonical: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, + PLSRegression: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, + PLSSVD: {"check_dict_unchanged": dict(n_components=1)}, + PolynomialCountSketch: {"check_dict_unchanged": dict(n_components=1)}, + RBFSampler: {"check_dict_unchanged": dict(n_components=1)}, + SkewedChi2Sampler: {"check_dict_unchanged": dict(n_components=1)}, + SparsePCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, + SparseRandomProjection: {"check_dict_unchanged": dict(n_components=1)}, + SpectralBiclustering: { + "check_dict_unchanged": dict(n_best=1, n_clusters=1, n_components=1, n_init=2) + }, + SpectralClustering: { + "check_dict_unchanged": dict(n_clusters=1, n_components=1, n_init=2) + }, + SpectralCoclustering: {"check_dict_unchanged": dict(n_clusters=1, n_init=2)}, + SpectralEmbedding: {"check_dict_unchanged": dict(eigen_tol=1e-05, n_components=1)}, + TSNE: {"check_dict_unchanged": dict(n_components=1, perplexity=2)}, + TruncatedSVD: {"check_dict_unchanged": dict(n_components=1)}, +} def _tested_estimators(type_filter=None): @@ -486,17 +567,24 @@ def _yield_instances_for_check(check, estimator_orig): an estimator for each parameter set in CHECK_PARAMS[estimator]. """ if type(estimator_orig) not in CHECK_PARAMS: - return (estimator_orig,) + yield estimator_orig + return check_params = CHECK_PARAMS[type(estimator_orig)] - if "*" not in check_params and check not in check_params: - return (estimator_orig,) + try: + check_name = check.__name__ + except AttributeError: + # partial tests + check_name = check.func.__name__ - if "*" in check_params: - param_set = check_params["*"] - else: - param_set = check_params[check] + if check_name not in check_params: + yield estimator_orig + return + + param_set = check_params[check_name] + if isinstance(param_set, dict): + param_set = [param_set] for params in param_set: estimator = clone(estimator_orig) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3fda4b98af2e4..1a25bc10e0ca1 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -84,7 +84,6 @@ def _yield_api_checks(estimator): - yield check_estimator_cloneable yield check_estimator_repr yield check_no_attributes_set_in_init yield check_fit_score_takes_y @@ -1243,32 +1242,13 @@ def check_complex_data(name, estimator_orig): @ignore_warnings def check_dict_unchanged(name, estimator_orig): - # this estimator raises - # ValueError: Found array with 0 feature(s) (shape=(23, 0)) - # while a minimum of 1 is required. - # error - if name in ["SpectralCoclustering"]: - return rnd = np.random.RandomState(0) - if name in ["RANSACRegressor"]: - X = 3 * rnd.uniform(size=(20, 3)) - else: - X = 2 * rnd.uniform(size=(20, 3)) - + X = 3 * rnd.uniform(size=(20, 3)) X = _enforce_estimator_tags_X(estimator_orig, X) y = X[:, 0].astype(int) estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) - if hasattr(estimator, "n_components"): - estimator.n_components = 1 - - if hasattr(estimator, "n_clusters"): - estimator.n_clusters = 1 - - if hasattr(estimator, "n_best"): - estimator.n_best = 1 - set_random_state(estimator, 1) estimator.fit(X, y) From 17d15d64f76358b1885f4311fdad0efdb4a3a063 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 10 Sep 2024 15:16:45 +0200 Subject: [PATCH 4/6] ... --- sklearn/utils/_test_common/instance_generator.py | 1 + sklearn/utils/estimator_checks.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index c410dcc933c15..61f35e200834f 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -566,6 +566,7 @@ def _yield_instances_for_check(check, estimator_orig): For estimators which have an entry in CHECK_PARAMS, this will yield an estimator for each parameter set in CHECK_PARAMS[estimator]. """ + # TODO(devtools): enable this behavior for third party estimators as well if type(estimator_orig) not in CHECK_PARAMS: yield estimator_orig return diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1a25bc10e0ca1..725d1c82443b2 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3292,9 +3292,6 @@ def check_parameters_default_constructible(name, estimator_orig): Estimator = estimator_orig.__class__ with ignore_warnings(category=FutureWarning): - # TODO(devtools): this test is flawed because _construct_instances doesn't - # construct with default parameters for sklearn estimators, while it does - # for third party estimators. estimator = _construct_instance(Estimator) # test that set_params returns self assert estimator.set_params() is estimator From 42e1e33bae54b00a3dd4f1cfa859ca1b67c499e9 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 10 Sep 2024 15:33:55 +0200 Subject: [PATCH 5/6] fix tags --- sklearn/cluster/_bicluster.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index c42f2a133fa24..08cd63b58cbaa 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -364,11 +364,13 @@ def _fit(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags._xfail_checks = { - # ValueError: Found array with 0 feature(s) (shape=(23, 0)) - # while a minimum of 1 is required. - "check_dict_unchanged": "FIXME", - } + tags._xfail_checks.update( + { + # ValueError: Found array with 0 feature(s) (shape=(23, 0)) + # while a minimum of 1 is required. + "check_dict_unchanged": "FIXME", + } + ) return tags From 226a4509aa859751d08ad7d5434f01deb00cceb3 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 11 Sep 2024 15:07:35 +0200 Subject: [PATCH 6/6] rename dict to PER_ESTIMATOR_CHECK_PARAMS --- sklearn/utils/_test_common/instance_generator.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 4080d47e4ab42..c2c6be238d52d 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -474,7 +474,8 @@ # This dictionary stores parameters for specific checks. It also enables running the # same check with multiple instances of the same estimator with different parameters. # The special key "*" allows to apply the parameters to all checks. -CHECK_PARAMS: dict = { +# TODO(devtools): allow third-party developers to pass test specific params to checks +PER_ESTIMATOR_CHECK_PARAMS: dict = { # TODO(devtools): check that function names here exist in checks for the estimator # TODO(devtools): write a test for the same thing with tags._xfail_checks AgglomerativeClustering: {"check_dict_unchanged": dict(n_clusters=1)}, @@ -620,15 +621,15 @@ def _yield_instances_for_check(check, estimator_orig): For most estimators, this is a no-op. - For estimators which have an entry in CHECK_PARAMS, this will yield - an estimator for each parameter set in CHECK_PARAMS[estimator]. + For estimators which have an entry in PER_ESTIMATOR_CHECK_PARAMS, this will yield + an estimator for each parameter set in PER_ESTIMATOR_CHECK_PARAMS[estimator]. """ # TODO(devtools): enable this behavior for third party estimators as well - if type(estimator_orig) not in CHECK_PARAMS: + if type(estimator_orig) not in PER_ESTIMATOR_CHECK_PARAMS: yield estimator_orig return - check_params = CHECK_PARAMS[type(estimator_orig)] + check_params = PER_ESTIMATOR_CHECK_PARAMS[type(estimator_orig)] try: check_name = check.__name__