From d28c3cb7c3f465a668a655a36d477a31f360ae35 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 21 Aug 2024 15:50:51 +0200 Subject: [PATCH 01/17] TST allow categorisation of tests into API and legacy --- sklearn/utils/estimator_checks.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 42edfe0d4d3c4..01b1276edcdaa 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -90,13 +90,17 @@ CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"] +def _yield_api_checks(estimator): + yield check_no_attributes_set_in_init + yield check_fit_score_takes_y + yield check_estimators_overwrite_params + + def _yield_checks(estimator): name = estimator.__class__.__name__ tags = _safe_tags(estimator) - yield check_no_attributes_set_in_init yield check_estimators_dtypes - yield check_fit_score_takes_y if has_fit_parameter(estimator, "sample_weight"): yield check_sample_weights_pandas_series yield check_sample_weights_not_an_array @@ -129,7 +133,6 @@ def _yield_checks(estimator): # Check that pairwise estimator throws error on non-square input yield check_nonsquare_error - yield check_estimators_overwrite_params if hasattr(estimator, "sparsify"): yield check_sparsify_coefficients @@ -323,7 +326,7 @@ def _yield_array_api_checks(estimator): ) -def _yield_all_checks(estimator): +def _yield_all_checks(estimator, legacy: bool): name = estimator.__class__.__name__ tags = _safe_tags(estimator) if "2darray" not in tags["X_types"]: @@ -341,6 +344,12 @@ def _yield_all_checks(estimator): ) return + for check in _yield_api_checks(estimator): + yield check + + if not legacy: + return + for check in _yield_checks(estimator): yield check if is_classifier(estimator): @@ -513,9 +522,14 @@ def _should_be_skipped_or_marked(estimator, check): return False, "placeholder reason that will never be used" -def parametrize_with_checks(estimators): +def parametrize_with_checks(estimators, legacy=True): """Pytest specific decorator for parametrizing estimator checks. + Checks are categorised into the following groups: + + - API checks: a set of checks to ensure API compatibility with scikit-learn + - legacy: a set of checks which gradually will be grouped into other categories + The `id` of each check is set to be a pprint version of the estimator and the name of the check with its keyword arguments. This allows to use `pytest -k` to specify which tests to run:: @@ -533,6 +547,11 @@ def parametrize_with_checks(estimators): .. versionadded:: 0.24 + legacy : bool (default=True) + Whether to include legacy checks. + + .. versionadded:: 1.6 + Returns ------- decorator : `pytest.mark.parametrize` @@ -566,7 +585,7 @@ def parametrize_with_checks(estimators): def checks_generator(): for estimator in estimators: name = type(estimator).__name__ - for check in _yield_all_checks(estimator): + for check in _yield_all_checks(estimator, legacy=legacy): check = partial(check, name) yield _maybe_mark_xfail(estimator, check, pytest) From 13a8e27c422a2d383585f7b098216984e797234e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 22 Aug 2024 08:09:41 +0200 Subject: [PATCH 02/17] TST refactor instance generation and parameter setting --- doc/sphinxext/allow_nan_estimators.py | 2 +- sklearn/decomposition/tests/test_pca.py | 2 +- sklearn/linear_model/tests/test_ridge.py | 2 +- sklearn/preprocessing/tests/test_data.py | 2 +- sklearn/tests/test_common.py | 104 +---- sklearn/tests/test_docstring_parameters.py | 2 +- sklearn/utils/_test_common/__init__.py | 2 + .../utils/_test_common/instance_generator.py | 441 ++++++++++++++++++ sklearn/utils/estimator_checks.py | 220 +-------- sklearn/utils/tests/test_estimator_checks.py | 2 +- 10 files changed, 466 insertions(+), 313 deletions(-) create mode 100644 sklearn/utils/_test_common/__init__.py create mode 100644 sklearn/utils/_test_common/instance_generator.py diff --git a/doc/sphinxext/allow_nan_estimators.py b/doc/sphinxext/allow_nan_estimators.py index 89d7077bce2b5..00a6ddc0048e9 100755 --- a/doc/sphinxext/allow_nan_estimators.py +++ b/doc/sphinxext/allow_nan_estimators.py @@ -4,8 +4,8 @@ from docutils.parsers.rst import Directive from sklearn.utils import all_estimators +from sklearn.utils._test_common.instance_generator import _construct_instance from sklearn.utils._testing import SkipTest -from sklearn.utils.estimator_checks import _construct_instance class AllowNanEstimators(Directive): diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index bd7f60061abdc..52f769bfb9001 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -17,9 +17,9 @@ yield_namespace_device_dtype_combinations, ) from sklearn.utils._array_api import device as array_device +from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids from sklearn.utils._testing import _array_api_for_tests, assert_allclose from sklearn.utils.estimator_checks import ( - _get_check_estimator_ids, check_array_api_input_and_values, ) from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 9db58dd499269..c727d268e0ebc 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -48,6 +48,7 @@ yield_namespace_device_dtype_combinations, yield_namespaces, ) +from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids from sklearn.utils._testing import ( assert_allclose, assert_almost_equal, @@ -57,7 +58,6 @@ ) from sklearn.utils.estimator_checks import ( _array_api_for_tests, - _get_check_estimator_ids, check_array_api_input_and_values, ) from sklearn.utils.fixes import ( diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 5d254e491b400..049b188cf66a7 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -40,6 +40,7 @@ from sklearn.utils._array_api import ( yield_namespace_device_dtype_combinations, ) +from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids from sklearn.utils._testing import ( _convert_container, assert_allclose, @@ -51,7 +52,6 @@ skip_if_32bit, ) from sklearn.utils.estimator_checks import ( - _get_check_estimator_ids, check_array_api_input_and_values, ) from sklearn.utils.fixes import ( diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 3a61503530f23..467c7db9a3d21 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -11,7 +11,7 @@ import warnings from functools import partial from inspect import isgenerator, signature -from itertools import chain, product +from itertools import chain import numpy as np import pytest @@ -26,25 +26,13 @@ MeanShift, SpectralClustering, ) -from sklearn.compose import ColumnTransformer from sklearn.datasets import make_blobs -from sklearn.decomposition import PCA from sklearn.exceptions import ConvergenceWarning, FitFailedWarning # make it possible to discover experimental estimators when calling `all_estimators` -from sklearn.experimental import ( - enable_halving_search_cv, # noqa - enable_iterative_imputer, # noqa -) -from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.linear_model import LogisticRegression from sklearn.linear_model._base import LinearClassifierMixin from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding -from sklearn.model_selection import ( - GridSearchCV, - HalvingGridSearchCV, - HalvingRandomSearchCV, - RandomizedSearchCV, -) from sklearn.neighbors import ( KNeighborsClassifier, KNeighborsRegressor, @@ -52,7 +40,7 @@ RadiusNeighborsClassifier, RadiusNeighborsRegressor, ) -from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.pipeline import make_pipeline from sklearn.preprocessing import ( FunctionTransformer, MinMaxScaler, @@ -62,15 +50,19 @@ from sklearn.semi_supervised import LabelPropagation, LabelSpreading from sklearn.utils import all_estimators from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags +from sklearn.utils._test_common.instance_generator import ( + _generate_column_transformer_instances, + _generate_pipeline, + _generate_search_cv_instances, + _get_check_estimator_ids, + _set_checking_parameters, + _tested_estimators, +) from sklearn.utils._testing import ( SkipTest, ignore_warnings, - set_random_state, ) from sklearn.utils.estimator_checks import ( - _construct_instance, - _get_check_estimator_ids, - _set_checking_parameters, check_class_weight_balanced_linear_classifier, check_dataframe_column_names_consistency, check_estimator, @@ -139,26 +131,6 @@ def test_get_check_estimator_ids(val, expected): assert _get_check_estimator_ids(val) == expected -def _tested_estimators(type_filter=None): - for name, Estimator in all_estimators(type_filter=type_filter): - try: - estimator = _construct_instance(Estimator) - except SkipTest: - continue - - yield estimator - - -def _generate_pipeline(): - for final_estimator in [Ridge(), LogisticRegression()]: - yield Pipeline( - steps=[ - ("scaler", StandardScaler()), - ("final_estimator", final_estimator), - ] - ) - - @parametrize_with_checks(list(chain(_tested_estimators(), _generate_pipeline()))) def test_estimators(estimator, check, request): # Common tests for estimator instances @@ -282,60 +254,6 @@ def test_class_support_removed(): parametrize_with_checks([LogisticRegression]) -def _generate_column_transformer_instances(): - yield ColumnTransformer( - transformers=[ - ("trans1", StandardScaler(), [0, 1]), - ] - ) - - -def _generate_search_cv_instances(): - for SearchCV, (Estimator, param_grid) in product( - [ - GridSearchCV, - HalvingGridSearchCV, - RandomizedSearchCV, - HalvingGridSearchCV, - ], - [ - (Ridge, {"alpha": [0.1, 1.0]}), - (LogisticRegression, {"C": [0.1, 1.0]}), - ], - ): - init_params = signature(SearchCV).parameters - extra_params = ( - {"min_resources": "smallest"} if "min_resources" in init_params else {} - ) - search_cv = SearchCV( - Estimator(), param_grid, cv=2, error_score="raise", **extra_params - ) - set_random_state(search_cv) - yield search_cv - - for SearchCV, (Estimator, param_grid) in product( - [ - GridSearchCV, - HalvingGridSearchCV, - RandomizedSearchCV, - HalvingRandomSearchCV, - ], - [ - (Ridge, {"ridge__alpha": [0.1, 1.0]}), - (LogisticRegression, {"logisticregression__C": [0.1, 1.0]}), - ], - ): - init_params = signature(SearchCV).parameters - extra_params = ( - {"min_resources": "smallest"} if "min_resources" in init_params else {} - ) - search_cv = SearchCV( - make_pipeline(PCA(), Estimator()), param_grid, cv=2, **extra_params - ).set_params(error_score="raise") - set_random_state(search_cv) - yield search_cv - - @parametrize_with_checks(list(_generate_search_cv_instances())) def test_search_cv(estimator, check, request): # Common tests for SearchCV instances diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 3af463b783bc3..687b85ed00187 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -22,6 +22,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import FunctionTransformer from sklearn.utils import all_estimators +from sklearn.utils._test_common.instance_generator import _construct_instance from sklearn.utils._testing import ( _get_func_name, check_docstring_parameters, @@ -29,7 +30,6 @@ ) from sklearn.utils.deprecation import _is_deprecated from sklearn.utils.estimator_checks import ( - _construct_instance, _enforce_estimator_tags_X, _enforce_estimator_tags_y, ) diff --git a/sklearn/utils/_test_common/__init__.py b/sklearn/utils/_test_common/__init__.py new file mode 100644 index 0000000000000..67dd18fb94b59 --- /dev/null +++ b/sklearn/utils/_test_common/__init__.py @@ -0,0 +1,2 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py new file mode 100644 index 0000000000000..3e215111adcda --- /dev/null +++ b/sklearn/utils/_test_common/instance_generator.py @@ -0,0 +1,441 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import re +import warnings +from functools import partial +from inspect import isfunction, signature +from itertools import product + +from sklearn import config_context +from sklearn.base import RegressorMixin +from sklearn.calibration import CalibratedClassifierCV +from sklearn.cluster import ( + HDBSCAN, + AffinityPropagation, + AgglomerativeClustering, + Birch, + BisectingKMeans, + FeatureAgglomeration, + KMeans, + MeanShift, + MiniBatchKMeans, + SpectralBiclustering, + SpectralClustering, + SpectralCoclustering, +) +from sklearn.compose import ColumnTransformer +from sklearn.covariance import GraphicalLasso, GraphicalLassoCV +from sklearn.cross_decomposition import CCA, PLSSVD, PLSCanonical, PLSRegression +from sklearn.decomposition import ( + NMF, + PCA, + DictionaryLearning, + FactorAnalysis, + FastICA, + IncrementalPCA, + LatentDirichletAllocation, + MiniBatchDictionaryLearning, + MiniBatchNMF, + MiniBatchSparsePCA, + SparsePCA, + TruncatedSVD, +) +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import ( + AdaBoostClassifier, + AdaBoostRegressor, + BaggingClassifier, + BaggingRegressor, + ExtraTreesClassifier, + ExtraTreesRegressor, + GradientBoostingClassifier, + GradientBoostingRegressor, + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, + IsolationForest, + RandomForestClassifier, + RandomForestRegressor, + RandomTreesEmbedding, + StackingClassifier, + StackingRegressor, +) +from sklearn.exceptions import SkipTestWarning +from sklearn.experimental import enable_halving_search_cv # noqa +from sklearn.feature_selection import ( + RFECV, + SelectFdr, + SelectFromModel, + SelectKBest, + SequentialFeatureSelector, +) +from sklearn.linear_model import ( + ARDRegression, + BayesianRidge, + ElasticNet, + ElasticNetCV, + GammaRegressor, + HuberRegressor, + LarsCV, + Lasso, + LassoCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + LinearRegression, + LogisticRegression, + LogisticRegressionCV, + MultiTaskElasticNet, + MultiTaskElasticNetCV, + MultiTaskLasso, + MultiTaskLassoCV, + OrthogonalMatchingPursuitCV, + PassiveAggressiveClassifier, + PassiveAggressiveRegressor, + Perceptron, + PoissonRegressor, + RANSACRegressor, + Ridge, + SGDClassifier, + SGDOneClassSVM, + SGDRegressor, + TheilSenRegressor, + TweedieRegressor, +) +from sklearn.manifold import MDS, TSNE, LocallyLinearEmbedding, SpectralEmbedding +from sklearn.mixture import BayesianGaussianMixture, GaussianMixture +from sklearn.model_selection import ( + GridSearchCV, + HalvingGridSearchCV, + HalvingRandomSearchCV, + RandomizedSearchCV, + TunedThresholdClassifierCV, +) +from sklearn.multioutput import ClassifierChain, RegressorChain +from sklearn.neighbors import NeighborhoodComponentsAnalysis +from sklearn.neural_network import BernoulliRBM, MLPClassifier, MLPRegressor +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder +from sklearn.random_projection import ( + GaussianRandomProjection, + SparseRandomProjection, +) +from sklearn.semi_supervised import ( + LabelPropagation, + LabelSpreading, + SelfTrainingClassifier, +) +from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils import all_estimators +from sklearn.utils._testing import SkipTest, set_random_state + +CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"] + +# The following dictionary is to indicate constructor arguments suitable for the test +# suite, which uses very small datasets, and is intended to run rather quickly. +TEST_PARAMS = { + AdaBoostClassifier: dict(n_estimators=5), + AdaBoostRegressor: dict(n_estimators=5), + AffinityPropagation: dict(max_iter=5), + AgglomerativeClustering: dict(n_clusters=2), + ARDRegression: dict(max_iter=5), + BaggingClassifier: dict(n_estimators=5), + BaggingRegressor: dict(n_estimators=5), + BayesianGaussianMixture: dict(n_init=2, max_iter=5), + BayesianRidge: dict(max_iter=5), + BernoulliRBM: dict(n_iter=5, batch_size=10), + Birch: dict(n_clusters=2), + BisectingKMeans: dict(n_init=2, n_clusters=2, max_iter=5), + CalibratedClassifierCV: dict(cv=3), + CCA: dict(n_components=1, max_iter=5), + ClassifierChain: dict(cv=3), + DictionaryLearning: dict(max_iter=20, transform_algorithm="lasso_lars"), + # the default strategy prior would output constant predictions and fail + # for check_classifiers_predictions + DummyClassifier: dict(strategy="stratified"), + ElasticNetCV: dict(max_iter=5, cv=3), + ElasticNet: dict(max_iter=5), + ExtraTreesClassifier: dict(n_estimators=5), + ExtraTreesRegressor: dict(n_estimators=5), + FactorAnalysis: dict(max_iter=5), + FastICA: dict(max_iter=5), + FeatureAgglomeration: dict(n_clusters=2), + GammaRegressor: dict(max_iter=5), + GaussianMixture: dict(n_init=2, max_iter=5), + # Due to the jl lemma and often very few samples, the number + # of components of the random matrix projection will be probably + # greater than the number of features. + # So we impose a smaller number (avoid "auto" mode) + GaussianRandomProjection: dict(n_components=2), + GradientBoostingClassifier: dict(n_estimators=5), + GradientBoostingRegressor: dict(n_estimators=5), + GraphicalLassoCV: dict(max_iter=5, cv=3), + GraphicalLasso: dict(max_iter=5), + GridSearchCV: dict(cv=3), + HalvingGridSearchCV: dict(cv=3), + HalvingRandomSearchCV: dict(cv=3), + HDBSCAN: dict(min_samples=1), + # The default min_samples_leaf (20) isn't appropriate for small + # datasets (only very shallow trees are built) that the checks use. + HistGradientBoostingClassifier: dict(max_iter=5, min_samples_leaf=5), + HistGradientBoostingRegressor: dict(max_iter=5, min_samples_leaf=5), + HuberRegressor: dict(max_iter=5), + IncrementalPCA: dict(batch_size=10), + IsolationForest: dict(n_estimators=5), + KMeans: dict(n_init=2, n_clusters=2, max_iter=5), + LabelPropagation: dict(max_iter=5), + LabelSpreading: dict(max_iter=5), + LarsCV: dict(max_iter=5, cv=3), + LassoCV: dict(max_iter=5, cv=3), + Lasso: dict(max_iter=5), + LassoLarsCV: dict(max_iter=5, cv=3), + LassoLars: dict(max_iter=5), + # Noise variance estimation does not work when `n_samples < n_features`. + # We need to provide the noise variance explicitly. + LassoLarsIC: dict(max_iter=5, noise_variance=1.0), + LatentDirichletAllocation: dict(max_iter=5, batch_size=10), + LinearSVR: dict(max_iter=20), + LinearSVC: dict(max_iter=20), + LocallyLinearEmbedding: dict(max_iter=5), + LogisticRegressionCV: dict(max_iter=5, cv=3), + LogisticRegression: dict(max_iter=5), + MDS: dict(n_init=2, max_iter=5), + # In the case of check_fit2d_1sample, bandwidth is set to None and + # is thus estimated. De facto it is 0.0 as a single sample is provided + # and this makes the test fails. Hence we give it a placeholder value. + MeanShift: dict(max_iter=5, bandwidth=1.0), + MiniBatchDictionaryLearning: dict(batch_size=10, max_iter=5), + MiniBatchKMeans: dict(n_init=2, n_clusters=2, max_iter=5, batch_size=10), + MiniBatchNMF: dict(batch_size=10, max_iter=20, fresh_restarts=True), + MiniBatchSparsePCA: dict(max_iter=5, batch_size=10), + MLPClassifier: dict(max_iter=100), + MLPRegressor: dict(max_iter=100), + MultiTaskElasticNetCV: dict(max_iter=5, cv=3), + MultiTaskElasticNet: dict(max_iter=5), + MultiTaskLassoCV: dict(max_iter=5, cv=3), + MultiTaskLasso: dict(max_iter=5), + NeighborhoodComponentsAnalysis: dict(max_iter=5), + NMF: dict(max_iter=500), + NuSVC: dict(max_iter=-1), + NuSVR: dict(max_iter=-1), + OneClassSVM: dict(max_iter=-1), + OneHotEncoder: dict(handle_unknown="ignore"), + OrthogonalMatchingPursuitCV: dict(cv=3), + PassiveAggressiveClassifier: dict(max_iter=5), + PassiveAggressiveRegressor: dict(max_iter=5), + Perceptron: dict(max_iter=5), + PLSCanonical: dict(n_components=1, max_iter=5), + PLSRegression: dict(n_components=1, max_iter=5), + PLSSVD: dict(n_components=1), + PoissonRegressor: dict(max_iter=5), + RandomForestClassifier: dict(n_estimators=5), + RandomForestRegressor: dict(n_estimators=5), + RandomizedSearchCV: dict(n_iter=5, cv=3), + RandomTreesEmbedding: dict(n_estimators=5), + RANSACRegressor: dict(max_trials=10), + RegressorChain: dict(cv=3), + RFECV: dict(cv=3), + # be tolerant of noisy datasets (not actually speed) + SelectFdr: dict(alpha=0.5), + # SelectKBest has a default of k=10 + # which is more feature than we have in most case. + SelectKBest: dict(k=1), + SelfTrainingClassifier: dict(max_iter=5), + SequentialFeatureSelector: dict(cv=3), + SGDClassifier: dict(max_iter=5), + SGDOneClassSVM: dict(max_iter=5), + SGDRegressor: dict(max_iter=5), + SparsePCA: dict(max_iter=5), + # Due to the jl lemma and often very few samples, the number + # of components of the random matrix projection will be probably + # greater than the number of features. + # So we impose a smaller number (avoid "auto" mode) + SparseRandomProjection: dict(n_components=2), + SpectralBiclustering: dict(n_init=2, n_best=1, n_clusters=2), + SpectralClustering: dict(n_init=2, n_clusters=2), + SpectralCoclustering: dict(n_init=2, n_clusters=2), + # Default "auto" parameter can lead to different ordering of eigenvalues on + # windows: #24105 + SpectralEmbedding: dict(eigen_tol=1e-5), + StackingClassifier: dict(cv=3), + StackingRegressor: dict(cv=3), + SVC: dict(max_iter=-1), + SVR: dict(max_iter=-1), + TargetEncoder: dict(cv=3), + TheilSenRegressor: dict(max_iter=5, max_subpopulation=100), + # TruncatedSVD doesn't run with n_components = n_features + TruncatedSVD: dict(n_iter=5, n_components=1), + TSNE: dict(perplexity=2), + TunedThresholdClassifierCV: dict(cv=3), + TweedieRegressor: dict(max_iter=5), +} + + +def _set_checking_parameters(estimator): + # set parameters to speed up some estimators and + # avoid deprecated behaviour + params = estimator.get_params() + name = estimator.__class__.__name__ + + if type(estimator) in TEST_PARAMS: + test_params = TEST_PARAMS[type(estimator)] + estimator.set_params(**test_params) + + +def _tested_estimators(type_filter=None): + for name, Estimator in all_estimators(type_filter=type_filter): + try: + estimator = _construct_instance(Estimator) + except SkipTest: + continue + + yield estimator + + +def _generate_pipeline(): + for final_estimator in [Ridge(), LogisticRegression()]: + yield Pipeline( + steps=[ + ("scaler", StandardScaler()), + ("final_estimator", final_estimator), + ] + ) + + +def _construct_instance(Estimator): + """Construct Estimator instance if possible.""" + required_parameters = getattr(Estimator, "_required_parameters", []) + if len(required_parameters): + if required_parameters in (["estimator"], ["base_estimator"]): + # `RANSACRegressor` will raise an error with any model other + # than `LinearRegression` if we don't fix `min_samples` parameter. + # For common test, we can enforce using `LinearRegression` that + # is the default estimator in `RANSACRegressor` instead of `Ridge`. + if issubclass(Estimator, RANSACRegressor): + estimator = Estimator(LinearRegression()) + elif issubclass(Estimator, RegressorMixin): + estimator = Estimator(Ridge()) + elif issubclass(Estimator, SelectFromModel): + # Increases coverage because SGDRegressor has partial_fit + estimator = Estimator(SGDRegressor(random_state=0)) + else: + estimator = Estimator(LogisticRegression(C=1)) + elif required_parameters in (["estimators"],): + # Heterogeneous ensemble classes (i.e. stacking, voting) + if issubclass(Estimator, RegressorMixin): + estimator = Estimator( + estimators=[ + ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), + ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), + ] + ) + else: + estimator = Estimator( + estimators=[ + ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), + ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), + ] + ) + else: + msg = ( + f"Can't instantiate estimator {Estimator.__name__} " + f"parameters {required_parameters}" + ) + # raise additional warning to be shown by pytest + warnings.warn(msg, SkipTestWarning) + raise SkipTest(msg) + else: + estimator = Estimator() + return estimator + + +def _get_check_estimator_ids(obj): + """Create pytest ids for checks. + + When `obj` is an estimator, this returns the pprint version of the + estimator (with `print_changed_only=True`). When `obj` is a function, the + name of the function is returned with its keyword arguments. + + `_get_check_estimator_ids` is designed to be used as the `id` in + `pytest.mark.parametrize` where `check_estimator(..., generate_only=True)` + is yielding estimators and checks. + + Parameters + ---------- + obj : estimator or function + Items generated by `check_estimator`. + + Returns + ------- + id : str or None + + See Also + -------- + check_estimator + """ + if isfunction(obj): + return obj.__name__ + if isinstance(obj, partial): + if not obj.keywords: + return obj.func.__name__ + kwstring = ",".join(["{}={}".format(k, v) for k, v in obj.keywords.items()]) + return "{}({})".format(obj.func.__name__, kwstring) + if hasattr(obj, "get_params"): + with config_context(print_changed_only=True): + return re.sub(r"\s", "", str(obj)) + + +def _generate_column_transformer_instances(): + yield ColumnTransformer( + transformers=[ + ("trans1", StandardScaler(), [0, 1]), + ] + ) + + +def _generate_search_cv_instances(): + for SearchCV, (Estimator, param_grid) in product( + [ + GridSearchCV, + HalvingGridSearchCV, + RandomizedSearchCV, + HalvingGridSearchCV, + ], + [ + (Ridge, {"alpha": [0.1, 1.0]}), + (LogisticRegression, {"C": [0.1, 1.0]}), + ], + ): + init_params = signature(SearchCV).parameters + extra_params = ( + {"min_resources": "smallest"} if "min_resources" in init_params else {} + ) + search_cv = SearchCV( + Estimator(), param_grid, cv=2, error_score="raise", **extra_params + ) + set_random_state(search_cv) + yield search_cv + + for SearchCV, (Estimator, param_grid) in product( + [ + GridSearchCV, + HalvingGridSearchCV, + RandomizedSearchCV, + HalvingRandomSearchCV, + ], + [ + (Ridge, {"ridge__alpha": [0.1, 1.0]}), + (LogisticRegression, {"logisticregression__C": [0.1, 1.0]}), + ], + ): + init_params = signature(SearchCV).parameters + extra_params = ( + {"min_resources": "smallest"} if "min_resources" in init_params else {} + ) + search_cv = SearchCV( + make_pipeline(PCA(), Estimator()), param_grid, cv=2, **extra_params + ).set_params(error_score="raise") + set_random_state(search_cv) + yield search_cv diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 42edfe0d4d3c4..745503c54a7aa 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -9,7 +9,7 @@ from contextlib import nullcontext from copy import deepcopy from functools import partial, wraps -from inspect import isfunction, signature +from inspect import signature from numbers import Integral, Real import joblib @@ -20,7 +20,6 @@ from .. import config_context from ..base import ( ClusterMixin, - RegressorMixin, clone, is_classifier, is_outlier_detector, @@ -34,22 +33,12 @@ make_regression, ) from ..exceptions import DataConversionWarning, NotFittedError, SkipTestWarning -from ..feature_selection import SelectFromModel, SelectKBest -from ..linear_model import ( - LinearRegression, - LogisticRegression, - RANSACRegressor, - Ridge, - SGDRegressor, -) from ..metrics import accuracy_score, adjusted_rand_score, f1_score from ..metrics.pairwise import linear_kernel, pairwise_distances, rbf_kernel from ..model_selection import ShuffleSplit, train_test_split from ..model_selection._validation import _safe_split from ..pipeline import make_pipeline from ..preprocessing import StandardScaler, scale -from ..random_projection import BaseRandomProjection -from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils._array_api import ( _atol_for_type, _convert_to_numpy, @@ -69,6 +58,11 @@ _DEFAULT_TAGS, _safe_tags, ) +from ._test_common.instance_generator import ( + CROSS_DECOMPOSITION, + _construct_instance, + _get_check_estimator_ids, +) from ._testing import ( SkipTest, _array_api_for_tests, @@ -87,7 +81,6 @@ from .validation import _num_samples, check_is_fitted, has_fit_parameter REGRESSION_DATASET = None -CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"] def _yield_checks(estimator): @@ -380,89 +373,6 @@ def _yield_all_checks(estimator): yield check_fit_non_negative -def _get_check_estimator_ids(obj): - """Create pytest ids for checks. - - When `obj` is an estimator, this returns the pprint version of the - estimator (with `print_changed_only=True`). When `obj` is a function, the - name of the function is returned with its keyword arguments. - - `_get_check_estimator_ids` is designed to be used as the `id` in - `pytest.mark.parametrize` where `check_estimator(..., generate_only=True)` - is yielding estimators and checks. - - Parameters - ---------- - obj : estimator or function - Items generated by `check_estimator`. - - Returns - ------- - id : str or None - - See Also - -------- - check_estimator - """ - if isfunction(obj): - return obj.__name__ - if isinstance(obj, partial): - if not obj.keywords: - return obj.func.__name__ - kwstring = ",".join(["{}={}".format(k, v) for k, v in obj.keywords.items()]) - return "{}({})".format(obj.func.__name__, kwstring) - if hasattr(obj, "get_params"): - with config_context(print_changed_only=True): - return re.sub(r"\s", "", str(obj)) - - -def _construct_instance(Estimator): - """Construct Estimator instance if possible.""" - required_parameters = getattr(Estimator, "_required_parameters", []) - if len(required_parameters): - if required_parameters in (["estimator"], ["base_estimator"]): - # `RANSACRegressor` will raise an error with any model other - # than `LinearRegression` if we don't fix `min_samples` parameter. - # For common test, we can enforce using `LinearRegression` that - # is the default estimator in `RANSACRegressor` instead of `Ridge`. - if issubclass(Estimator, RANSACRegressor): - estimator = Estimator(LinearRegression()) - elif issubclass(Estimator, RegressorMixin): - estimator = Estimator(Ridge()) - elif issubclass(Estimator, SelectFromModel): - # Increases coverage because SGDRegressor has partial_fit - estimator = Estimator(SGDRegressor(random_state=0)) - else: - estimator = Estimator(LogisticRegression(C=1)) - elif required_parameters in (["estimators"],): - # Heterogeneous ensemble classes (i.e. stacking, voting) - if issubclass(Estimator, RegressorMixin): - estimator = Estimator( - estimators=[ - ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), - ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), - ] - ) - else: - estimator = Estimator( - estimators=[ - ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), - ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), - ] - ) - else: - msg = ( - f"Can't instantiate estimator {Estimator.__name__} " - f"parameters {required_parameters}" - ) - # raise additional warning to be shown by pytest - warnings.warn(msg, SkipTestWarning) - raise SkipTest(msg) - else: - estimator = Estimator() - return estimator - - def _maybe_mark_xfail(estimator, check, pytest): # Mark (estimator, check) pairs as XFAIL if needed (see conditions in # _should_be_skipped_or_marked()) @@ -672,124 +582,6 @@ def _regression_dataset(): return REGRESSION_DATASET -def _set_checking_parameters(estimator): - # set parameters to speed up some estimators and - # avoid deprecated behaviour - params = estimator.get_params() - name = estimator.__class__.__name__ - if name == "TSNE": - estimator.set_params(perplexity=2) - if "n_iter" in params and name != "TSNE": - estimator.set_params(n_iter=5) - if "max_iter" in params: - if estimator.max_iter is not None: - estimator.set_params(max_iter=min(5, estimator.max_iter)) - # LinearSVR, LinearSVC - if name in ["LinearSVR", "LinearSVC"]: - estimator.set_params(max_iter=20) - # NMF - if name == "NMF": - estimator.set_params(max_iter=500) - # DictionaryLearning - if name == "DictionaryLearning": - estimator.set_params(max_iter=20, transform_algorithm="lasso_lars") - # MiniBatchNMF - if estimator.__class__.__name__ == "MiniBatchNMF": - estimator.set_params(max_iter=20, fresh_restarts=True) - # MLP - if name in ["MLPClassifier", "MLPRegressor"]: - estimator.set_params(max_iter=100) - # MiniBatchDictionaryLearning - if name == "MiniBatchDictionaryLearning": - estimator.set_params(max_iter=5) - - if "n_resampling" in params: - # randomized lasso - estimator.set_params(n_resampling=5) - if "n_estimators" in params: - estimator.set_params(n_estimators=min(5, estimator.n_estimators)) - if "max_trials" in params: - # RANSAC - estimator.set_params(max_trials=10) - if "n_init" in params: - # K-Means - estimator.set_params(n_init=2) - if "batch_size" in params and not name.startswith("MLP"): - estimator.set_params(batch_size=10) - - if name == "MeanShift": - # In the case of check_fit2d_1sample, bandwidth is set to None and - # is thus estimated. De facto it is 0.0 as a single sample is provided - # and this makes the test fails. Hence we give it a placeholder value. - estimator.set_params(bandwidth=1.0) - - if name == "TruncatedSVD": - # TruncatedSVD doesn't run with n_components = n_features - # This is ugly :-/ - estimator.n_components = 1 - - if name == "LassoLarsIC": - # Noise variance estimation does not work when `n_samples < n_features`. - # We need to provide the noise variance explicitly. - estimator.set_params(noise_variance=1.0) - - if hasattr(estimator, "n_clusters"): - estimator.n_clusters = min(estimator.n_clusters, 2) - - if hasattr(estimator, "n_best"): - estimator.n_best = 1 - - if name == "SelectFdr": - # be tolerant of noisy datasets (not actually speed) - estimator.set_params(alpha=0.5) - - if name == "TheilSenRegressor": - estimator.max_subpopulation = 100 - - if isinstance(estimator, BaseRandomProjection): - # Due to the jl lemma and often very few samples, the number - # of components of the random matrix projection will be probably - # greater than the number of features. - # So we impose a smaller number (avoid "auto" mode) - estimator.set_params(n_components=2) - - if isinstance(estimator, SelectKBest): - # SelectKBest has a default of k=10 - # which is more feature than we have in most case. - estimator.set_params(k=1) - - if name in ("HistGradientBoostingClassifier", "HistGradientBoostingRegressor"): - # The default min_samples_leaf (20) isn't appropriate for small - # datasets (only very shallow trees are built) that the checks use. - estimator.set_params(min_samples_leaf=5) - - if name == "DummyClassifier": - # the default strategy prior would output constant predictions and fail - # for check_classifiers_predictions - estimator.set_params(strategy="stratified") - - # Speed-up by reducing the number of CV or splits for CV estimators - loo_cv = ["RidgeCV", "RidgeClassifierCV"] - if name not in loo_cv and hasattr(estimator, "cv"): - estimator.set_params(cv=3) - if hasattr(estimator, "n_splits"): - estimator.set_params(n_splits=3) - - if name == "OneHotEncoder": - estimator.set_params(handle_unknown="ignore") - - if name in CROSS_DECOMPOSITION: - estimator.set_params(n_components=1) - - # Default "auto" parameter can lead to different ordering of eigenvalues on - # windows: #24105 - if name == "SpectralEmbedding": - estimator.set_params(eigen_tol=1e-5) - - if name == "HDBSCAN": - estimator.set_params(min_samples=1) - - class _NotAnArray: """An object that is convertible to an array. diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 8ac7ac9db2e9a..066277ff24af6 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -30,6 +30,7 @@ from sklearn.svm import SVC, NuSVC from sklearn.utils import _array_api, all_estimators, deprecated from sklearn.utils._param_validation import Interval, StrOptions +from sklearn.utils._test_common.instance_generator import _set_checking_parameters from sklearn.utils._testing import ( MinimalClassifier, MinimalRegressor, @@ -40,7 +41,6 @@ ) from sklearn.utils.estimator_checks import ( _NotAnArray, - _set_checking_parameters, _yield_all_checks, check_array_api_input, check_class_weight_balanced_linear_classifier, From 3474eea41c12ded3e7b9db287b309d11f3bd3abd Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 22 Aug 2024 08:19:22 +0200 Subject: [PATCH 03/17] add legacy to check_estimator --- sklearn/utils/estimator_checks.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 01b1276edcdaa..dbd15bd1c2089 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -594,7 +594,7 @@ def checks_generator(): ) -def check_estimator(estimator=None, generate_only=False): +def check_estimator(estimator=None, generate_only=False, legacy=True): """Check if estimator adheres to scikit-learn conventions. This function will run an extensive test-suite for input validation, @@ -613,6 +613,11 @@ def check_estimator(estimator=None, generate_only=False): :func:`~sklearn.utils.estimator_checks.parametrize_with_checks`, making it easier to test multiple estimators. + Checks are categorised into the following groups: + + - API checks: a set of checks to ensure API compatibility with scikit-learn + - legacy: a set of checks which gradually will be grouped into other categories + Parameters ---------- estimator : estimator object @@ -630,6 +635,11 @@ def check_estimator(estimator=None, generate_only=False): .. versionadded:: 0.22 + legacy : bool (default=True) + Whether to include legacy checks. + + .. versionadded:: 1.6 + Returns ------- checks_generator : generator @@ -659,7 +669,7 @@ def check_estimator(estimator=None, generate_only=False): name = type(estimator).__name__ def checks_generator(): - for check in _yield_all_checks(estimator): + for check in _yield_all_checks(estimator, legacy=legacy): check = _maybe_skip(estimator, check) yield estimator, partial(check, name) From 3975f17f28d8ea98fb2f4fcc1911694348ae7ee8 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 22 Aug 2024 17:44:08 +0200 Subject: [PATCH 04/17] fix tests --- sklearn/utils/tests/test_estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 8ac7ac9db2e9a..7cf7e19f70cfe 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -1210,7 +1210,7 @@ def test_non_deterministic_estimator_skip_tests(): # check estimators with non_deterministic tag set to True # will skip certain tests, refer to issue #22313 for details for est in [MinimalTransformer, MinimalRegressor, MinimalClassifier]: - all_tests = list(_yield_all_checks(est())) + all_tests = list(_yield_all_checks(est(), legacy=True)) assert check_methods_sample_order_invariance in all_tests assert check_methods_subset_invariance in all_tests @@ -1218,7 +1218,7 @@ class Estimator(est): def _more_tags(self): return {"non_deterministic": True} - all_tests = list(_yield_all_checks(Estimator())) + all_tests = list(_yield_all_checks(Estimator(), legacy=True)) assert check_methods_sample_order_invariance not in all_tests assert check_methods_subset_invariance not in all_tests From e27edd3dc930a6fb213ce44b442045a2b4c6932e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 23 Aug 2024 09:46:41 +0200 Subject: [PATCH 05/17] remove unnecessary vars --- sklearn/utils/_test_common/instance_generator.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 3e215111adcda..c8887ad524dd4 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -276,9 +276,6 @@ def _set_checking_parameters(estimator): # set parameters to speed up some estimators and # avoid deprecated behaviour - params = estimator.get_params() - name = estimator.__class__.__name__ - if type(estimator) in TEST_PARAMS: test_params = TEST_PARAMS[type(estimator)] estimator.set_params(**test_params) From 7eec0682bd5e5a6b33d77bc73cb0328a6e935164 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 23 Aug 2024 14:25:29 +0200 Subject: [PATCH 06/17] TST remove _required_parameters --- sklearn/tests/test_common.py | 9 +++------ sklearn/utils/estimator_checks.py | 27 +++++++++++++++++++++------ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 467c7db9a3d21..d00b3779b77a7 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -51,6 +51,7 @@ from sklearn.utils import all_estimators from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags from sklearn.utils._test_common.instance_generator import ( + _construct_instance, _generate_column_transformer_instances, _generate_pipeline, _generate_search_cv_instances, @@ -151,12 +152,8 @@ def _tested_linear_classifiers(): with warnings.catch_warnings(record=True): for name, clazz in classifiers: - required_parameters = getattr(clazz, "_required_parameters", []) - if len(required_parameters): - # FIXME - continue - - if "class_weight" in clazz().get_params().keys() and issubclass( + instance = _construct_instance(clazz) + if "class_weight" in instance.get_params().keys() and issubclass( clazz, LinearClassifierMixin ): yield name, clazz diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 745503c54a7aa..e0a38492b0091 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -87,6 +87,8 @@ def _yield_checks(estimator): name = estimator.__class__.__name__ tags = _safe_tags(estimator) + yield check_estimator_cloneable + yield check_estimator_repr yield check_no_attributes_set_in_init yield check_estimators_dtypes yield check_fit_score_takes_y @@ -3208,6 +3210,23 @@ def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type): assert_allclose(pred1, pred2, atol=1e-2, err_msg=name) +def check_estimator_cloneable(name, estimator_orig): + """Checks whether the estimator can be cloned.""" + try: + clone(estimator_orig) + except Exception as e: + raise AssertionError(f"Cloning of {name} failed with error: {e}.") from e + + +def check_estimator_repr(name, estimator_orig): + """Check that the estimator has a functioning repr.""" + estimator = clone(estimator_orig) + try: + repr(estimator) + except Exception as e: + raise AssertionError(f"Repr of {name} failed with error: {e}.") from e + + def check_parameters_default_constructible(name, Estimator): # test default-constructibility # get rid of deprecation warnings @@ -3216,10 +3235,6 @@ def check_parameters_default_constructible(name, Estimator): with ignore_warnings(category=FutureWarning): estimator = _construct_instance(Estimator) - # test cloning - clone(estimator) - # test __repr__ - repr(estimator) # test that set_params returns self assert estimator.set_params() is estimator @@ -3239,6 +3254,8 @@ def param_filter(p): p.name != "self" and p.kind != p.VAR_KEYWORD and p.kind != p.VAR_POSITIONAL + # and it should have a default value for this test + and p.default != p.empty ) init_params = [ @@ -3250,8 +3267,6 @@ def param_filter(p): # true for mixins return params = estimator.get_params() - # they can need a non-default argument - init_params = init_params[len(getattr(estimator, "_required_parameters", [])) :] for init_param in init_params: assert ( From c3f1249be0418a67972a69ba585cd08bd5879a27 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 23 Aug 2024 15:24:38 +0200 Subject: [PATCH 07/17] TST remove _required_parameters --- .../utils/_test_common/instance_generator.py | 136 ++++++++++++------ sklearn/utils/estimator_checks.py | 12 +- 2 files changed, 101 insertions(+), 47 deletions(-) diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index c8887ad524dd4..3e56eec8abca8 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -3,13 +3,11 @@ import re -import warnings from functools import partial from inspect import isfunction, signature from itertools import product from sklearn import config_context -from sklearn.base import RegressorMixin from sklearn.calibration import CalibratedClassifierCV from sklearn.cluster import ( HDBSCAN, @@ -60,10 +58,12 @@ RandomTreesEmbedding, StackingClassifier, StackingRegressor, + VotingClassifier, + VotingRegressor, ) -from sklearn.exceptions import SkipTestWarning from sklearn.experimental import enable_halving_search_cv # noqa from sklearn.feature_selection import ( + RFE, RFECV, SelectFdr, SelectFromModel, @@ -106,16 +106,27 @@ from sklearn.manifold import MDS, TSNE, LocallyLinearEmbedding, SpectralEmbedding from sklearn.mixture import BayesianGaussianMixture, GaussianMixture from sklearn.model_selection import ( + FixedThresholdClassifier, GridSearchCV, HalvingGridSearchCV, HalvingRandomSearchCV, RandomizedSearchCV, TunedThresholdClassifierCV, ) -from sklearn.multioutput import ClassifierChain, RegressorChain +from sklearn.multiclass import ( + OneVsOneClassifier, + OneVsRestClassifier, + OutputCodeClassifier, +) +from sklearn.multioutput import ( + ClassifierChain, + MultiOutputClassifier, + MultiOutputRegressor, + RegressorChain, +) from sklearn.neighbors import NeighborhoodComponentsAnalysis from sklearn.neural_network import BernoulliRBM, MLPClassifier, MLPRegressor -from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder from sklearn.random_projection import ( GaussianRandomProjection, @@ -301,51 +312,86 @@ def _generate_pipeline(): ) +INIT_PARAMS = { + SelfTrainingClassifier: dict(estimator=LogisticRegression(C=1)), + CalibratedClassifierCV: dict(estimator=LogisticRegression(C=1)), + ClassifierChain: dict(base_estimator=LogisticRegression(C=1)), + ColumnTransformer: dict(transformers=[("trans1", StandardScaler(), [0])]), + FeatureUnion: dict(transformer_list=[("trans1", StandardScaler())]), + FixedThresholdClassifier: dict(estimator=LogisticRegression(C=1)), + GridSearchCV: dict(estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}), + HalvingGridSearchCV: dict( + estimator=LogisticRegression(C=1), param_grid={"C": [1.0]} + ), + HalvingRandomSearchCV: dict( + estimator=LogisticRegression(C=1), param_distributions={"C": [1.0]} + ), + MultiOutputClassifier: dict(estimator=LogisticRegression(C=1)), + MultiOutputRegressor: dict(estimator=Ridge()), + OneVsOneClassifier: dict(estimator=LogisticRegression(C=1)), + OneVsRestClassifier: dict(estimator=LogisticRegression(C=1)), + OutputCodeClassifier: dict(estimator=LogisticRegression(C=1)), + Pipeline: dict(steps=[("scaler", StandardScaler()), ("est", Ridge())]), + RandomizedSearchCV: dict( + estimator=LogisticRegression(C=1), param_distributions={"C": [1.0]} + ), + # `RANSACRegressor` will raise an error with any model other + # than `LinearRegression` if we don't fix `min_samples` parameter. + # For common test, we can enforce using `LinearRegression` that + # is the default estimator in `RANSACRegressor` instead of `Ridge`. + RANSACRegressor: dict(estimator=LinearRegression()), + RegressorChain: dict(base_estimator=Ridge()), + RFECV: dict(estimator=LogisticRegression(C=1)), + RFE: dict(estimator=LogisticRegression(C=1)), + # Increases coverage because SGDRegressor has partial_fit + SelectFromModel: dict(estimator=SGDRegressor(random_state=0)), + SequentialFeatureSelector: dict(estimator=LogisticRegression(C=1)), + StackingClassifier: dict( + estimators=[ + ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), + ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), + ] + ), + StackingRegressor: dict( + estimators=[ + ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), + ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), + ] + ), + TunedThresholdClassifierCV: dict(estimator=LogisticRegression(C=1)), + VotingClassifier: dict( + estimators=[ + ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), + ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), + ] + ), + VotingRegressor: dict( + estimators=[ + ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), + ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), + ] + ), +} + + def _construct_instance(Estimator): """Construct Estimator instance if possible.""" - required_parameters = getattr(Estimator, "_required_parameters", []) - if len(required_parameters): - if required_parameters in (["estimator"], ["base_estimator"]): - # `RANSACRegressor` will raise an error with any model other - # than `LinearRegression` if we don't fix `min_samples` parameter. - # For common test, we can enforce using `LinearRegression` that - # is the default estimator in `RANSACRegressor` instead of `Ridge`. - if issubclass(Estimator, RANSACRegressor): - estimator = Estimator(LinearRegression()) - elif issubclass(Estimator, RegressorMixin): - estimator = Estimator(Ridge()) - elif issubclass(Estimator, SelectFromModel): - # Increases coverage because SGDRegressor has partial_fit - estimator = Estimator(SGDRegressor(random_state=0)) - else: - estimator = Estimator(LogisticRegression(C=1)) - elif required_parameters in (["estimators"],): - # Heterogeneous ensemble classes (i.e. stacking, voting) - if issubclass(Estimator, RegressorMixin): - estimator = Estimator( - estimators=[ - ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), - ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), - ] - ) - else: - estimator = Estimator( - estimators=[ - ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), - ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), - ] - ) - else: - msg = ( - f"Can't instantiate estimator {Estimator.__name__} " - f"parameters {required_parameters}" - ) - # raise additional warning to be shown by pytest - warnings.warn(msg, SkipTestWarning) - raise SkipTest(msg) + if Estimator in INIT_PARAMS: + estimator = Estimator(**INIT_PARAMS[Estimator]) else: estimator = Estimator() return estimator + # else: + # msg = ( + # f"Can't instantiate estimator {Estimator.__name__} " + # f"parameters {required_parameters}" + # ) + # # raise additional warning to be shown by pytest + # warnings.warn(msg, SkipTestWarning) + # raise SkipTest(msg) + # else: + # estimator = Estimator() + # return estimator def _get_check_estimator_ids(obj): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e84cd57ff01ea..6785a8bd99937 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -60,6 +60,7 @@ ) from ._test_common.instance_generator import ( CROSS_DECOMPOSITION, + INIT_PARAMS, _construct_instance, _get_check_estimator_ids, ) @@ -3256,11 +3257,11 @@ def check_estimator_repr(name, estimator_orig): raise AssertionError(f"Repr of {name} failed with error: {e}.") from e -def check_parameters_default_constructible(name, Estimator): +def check_parameters_default_constructible(name, estimator_orig): # test default-constructibility # get rid of deprecation warnings - Estimator = Estimator.__class__ + Estimator = estimator_orig.__class__ with ignore_warnings(category=FutureWarning): estimator = _construct_instance(Estimator) @@ -3298,6 +3299,13 @@ def param_filter(p): params = estimator.get_params() for init_param in init_params: + if ( + type(estimator) in INIT_PARAMS + and init_param.name in INIT_PARAMS[type(estimator)] + ): + # these parameters are coming from INIT_PARAMS and not the default + # values, therefore ignored. + continue assert ( init_param.default != init_param.empty ), "parameter %s for %s has no default value" % ( From 7c4a3b28049e87584cd82564acf277257e675335 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sat, 24 Aug 2024 11:35:35 +0200 Subject: [PATCH 08/17] ignore failing tests --- sklearn/compose/_column_transformer.py | 13 +++++++++++ .../_search_successive_halving.py | 3 +++ sklearn/pipeline.py | 9 ++++++++ .../utils/_test_common/instance_generator.py | 22 +++++++++---------- 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 2f8c28d4b02b9..325c90d697acc 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -1320,6 +1320,19 @@ def get_metadata_routing(self): return router + def _more_tags(self): + return { + "_xfail_checks": { + "check_estimators_empty_data_messages": "FIXME", + "check_estimators_nan_inf": "FIXME", + "check_estimator_sparse_array": "FIXME", + "check_estimator_sparse_matrix": "FIXME", + "check_transformer_data_not_an_array": "FIXME", + "check_fit1d": "FIXME", + "check_fit2d_predict1d": "FIXME", + } + } + def _check_X(X): """Use check_array only when necessary, e.g. on lists and other non-array-likes.""" diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 5980d40cb8e40..e5e3096a85379 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -379,6 +379,9 @@ def _more_tags(self): "Fail during parameter check since min/max resources requires" " more samples" ), + "check_estimators_nan_inf": "FIXME", + "check_classifiers_one_label_sample_weights": "FIXME", + "check_fit2d_1feature": "FIXME", } ) return tags diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 47faefcfd56ff..db0bdf0b6440c 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -1881,6 +1881,15 @@ def get_metadata_routing(self): return router + def _more_tags(self): + return { + "_xfail_checks": { + "check_estimators_overwrite_params": "FIXME", + "check_estimators_nan_inf": "FIXME", + "check_dont_overwrite_parameters": "FIXME", + } + } + def make_union(*transformers, n_jobs=None, verbose=False): """Construct a :class:`FeatureUnion` from the given transformers. diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 3e56eec8abca8..541819259a9f9 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -3,6 +3,7 @@ import re +import warnings from functools import partial from inspect import isfunction, signature from itertools import product @@ -37,6 +38,7 @@ MiniBatchDictionaryLearning, MiniBatchNMF, MiniBatchSparsePCA, + SparseCoder, SparsePCA, TruncatedSVD, ) @@ -61,6 +63,7 @@ VotingClassifier, VotingRegressor, ) +from sklearn.exceptions import SkipTestWarning from sklearn.experimental import enable_halving_search_cv # noqa from sklearn.feature_selection import ( RFE, @@ -373,25 +376,22 @@ def _generate_pipeline(): ), } +SKIPPED_ESTIMATORS = [SparseCoder] + def _construct_instance(Estimator): """Construct Estimator instance if possible.""" + if Estimator in SKIPPED_ESTIMATORS: + msg = f"Can't instantiate estimator {Estimator.__name__}" + # raise additional warning to be shown by pytest + warnings.warn(msg, SkipTestWarning) + raise SkipTest(msg) + if Estimator in INIT_PARAMS: estimator = Estimator(**INIT_PARAMS[Estimator]) else: estimator = Estimator() return estimator - # else: - # msg = ( - # f"Can't instantiate estimator {Estimator.__name__} " - # f"parameters {required_parameters}" - # ) - # # raise additional warning to be shown by pytest - # warnings.warn(msg, SkipTestWarning) - # raise SkipTest(msg) - # else: - # estimator = Estimator() - # return estimator def _get_check_estimator_ids(obj): From 1f220c48620c4395f854dca861f05d0271d96971 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 4 Sep 2024 15:07:11 +0200 Subject: [PATCH 09/17] skipping tests that should be skipped --- sklearn/compose/_column_transformer.py | 2 ++ sklearn/tests/test_common.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 325c90d697acc..60c0feb9cb279 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -1330,6 +1330,8 @@ def _more_tags(self): "check_transformer_data_not_an_array": "FIXME", "check_fit1d": "FIXME", "check_fit2d_predict1d": "FIXME", + "check_complex_data": "FIXME", + "check_fit2d_1feature": "FIXME", } } diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index d2df478e7899e..ebaa7a6037979 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -26,6 +26,7 @@ MeanShift, SpectralClustering, ) +from sklearn.compose import ColumnTransformer from sklearn.datasets import make_blobs from sklearn.exceptions import ConvergenceWarning, FitFailedWarning from sklearn.experimental import ( @@ -45,7 +46,7 @@ RadiusNeighborsClassifier, RadiusNeighborsRegressor, ) -from sklearn.pipeline import make_pipeline +from sklearn.pipeline import FeatureUnion, make_pipeline from sklearn.preprocessing import ( FunctionTransformer, MinMaxScaler, @@ -310,6 +311,8 @@ def _estimators_that_predict_in_fit(): "estimator", column_name_estimators, ids=_get_check_estimator_ids ) def test_pandas_column_name_consistency(estimator): + if isinstance(estimator, ColumnTransformer): + pytest.skip("ColumnTransformer is not tested here") _set_checking_parameters(estimator) with ignore_warnings(category=(FutureWarning)): with warnings.catch_warnings(record=True) as record: @@ -400,6 +403,8 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator): ids=_get_check_estimator_ids, ) def test_check_param_validation(estimator): + if isinstance(estimator, FeatureUnion): + pytest.skip("FeatureUnion is not tested here") name = estimator.__class__.__name__ _set_checking_parameters(estimator) check_param_validation(name, estimator) From f434406db8afa6e20dada0e7cae66c9d0329ffc8 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 4 Sep 2024 15:49:22 +0200 Subject: [PATCH 10/17] remove _required_parameters --- doc/developers/develop.rst | 9 ------ sklearn/base.py | 28 ++++--------------- sklearn/compose/_column_transformer.py | 2 -- sklearn/decomposition/_dict_learning.py | 2 -- sklearn/ensemble/_base.py | 6 ---- .../_classification_threshold.py | 1 - sklearn/model_selection/_search.py | 4 --- .../_search_successive_halving.py | 4 --- sklearn/pipeline.py | 4 --- 9 files changed, 5 insertions(+), 55 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 97cb156da5812..c7e66d8df8ce5 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -659,15 +659,6 @@ Even if it is not recommended, it is possible to override the method any of the keys documented above is not present in the output of `_get_tags()`, an error will occur. -In addition to the tags, estimators also need to declare any non-optional -parameters to ``__init__`` in the ``_required_parameters`` class attribute, -which is a list or tuple. If ``_required_parameters`` is only -``["estimator"]`` or ``["base_estimator"]``, then the estimator will be -instantiated with an instance of ``LogisticRegression`` (or -``RidgeRegression`` if the estimator is a regressor) in the tests. The choice -of these two models is somewhat idiosyncratic but both should provide robust -closed-form solutions. - .. _developer_api_set_output: Developer API for `set_output` diff --git a/sklearn/base.py b/sklearn/base.py index 48b9081b8ade3..bc20b2282698f 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -1314,32 +1314,14 @@ def fit_predict(self, X, y=None, **kwargs): class MetaEstimatorMixin: """Mixin class for all meta estimators in scikit-learn. - This mixin defines the following functionality: - - - define `_required_parameters` that specify the mandatory `estimator` parameter. + This mixin is empty, and only exists to indicate that the estimator is a + meta-estimator. - Examples - -------- - >>> from sklearn.base import MetaEstimatorMixin - >>> from sklearn.datasets import load_iris - >>> from sklearn.linear_model import LogisticRegression - >>> class MyEstimator(MetaEstimatorMixin): - ... def __init__(self, *, estimator=None): - ... self.estimator = estimator - ... def fit(self, X, y=None): - ... if self.estimator is None: - ... self.estimator_ = LogisticRegression() - ... else: - ... self.estimator_ = self.estimator - ... return self - >>> X, y = load_iris(return_X_y=True) - >>> estimator = MyEstimator().fit(X, y) - >>> estimator.estimator_ - LogisticRegression() + .. versionchanged:: 1.6 + The `_required_parameters` is now removed and is unnecessary since tests are + refactored and don't use this anymore. """ - _required_parameters = ["estimator"] - class MultiOutputMixin: """Mixin to mark estimators that support multioutput.""" diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 60c0feb9cb279..fa97d6ff4edbd 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -285,8 +285,6 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): :ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`. """ - _required_parameters = ["transformers"] - _parameter_constraints: dict = { "transformers": [list, Hidden(tuple)], "remainder": [ diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 451d120756e9c..bb0131753929b 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1279,8 +1279,6 @@ class SparseCoder(_BaseSparseCoding, BaseEstimator): [ 0., 1., 1., 0., 0.]]) """ - _required_parameters = ["dictionary"] - def __init__( self, dictionary, diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 289c7c9b46f4a..a1527ecc088d8 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: BSD-3-Clause from abc import ABCMeta, abstractmethod -from typing import List import numpy as np from joblib import effective_n_jobs @@ -106,9 +105,6 @@ class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): The collection of fitted base estimators. """ - # overwrite _required_parameters from MetaEstimatorMixin - _required_parameters: List[str] = [] - @abstractmethod def __init__( self, @@ -200,8 +196,6 @@ class _BaseHeterogeneousEnsemble( appear in `estimators_`. """ - _required_parameters = ["estimators"] - @property def named_estimators(self): """Dictionary to access any fitted sub-estimators by name. diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index bd30a98ac7cc9..1c63d64a42252 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -87,7 +87,6 @@ class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator error. """ - _required_parameters = ["estimator"] _parameter_constraints: dict = { "estimator": [ HasMethods(["fit", "predict_proba"]), diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 9218b5bb6b3be..428f5bcdfe4eb 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1531,8 +1531,6 @@ class GridSearchCV(BaseSearchCV): 'std_fit_time', 'std_score_time', 'std_test_score'] """ - _required_parameters = ["estimator", "param_grid"] - _parameter_constraints: dict = { **BaseSearchCV._parameter_constraints, "param_grid": [dict, list], @@ -1912,8 +1910,6 @@ class RandomizedSearchCV(BaseSearchCV): {'C': np.float64(2...), 'penalty': 'l1'} """ - _required_parameters = ["estimator", "param_distributions"] - _parameter_constraints: dict = { **BaseSearchCV._parameter_constraints, "param_distributions": [dict, list], diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index e5e3096a85379..5f001d4ea6071 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -672,8 +672,6 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} """ - _required_parameters = ["estimator", "param_grid"] - _parameter_constraints: dict = { **BaseSuccessiveHalving._parameter_constraints, "param_grid": [dict, list], @@ -1022,8 +1020,6 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} """ - _required_parameters = ["estimator", "param_distributions"] - _parameter_constraints: dict = { **BaseSuccessiveHalving._parameter_constraints, "param_distributions": [dict, list], diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index db0bdf0b6440c..a6bf9c99cb694 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -152,8 +152,6 @@ class Pipeline(_BaseComposition): """ # BaseEstimator interface - _required_parameters = ["steps"] - _parameter_constraints: dict = { "steps": [list, Hidden(tuple)], "memory": [None, str, HasMethods(["cache"])], @@ -1426,8 +1424,6 @@ class FeatureUnion(TransformerMixin, _BaseComposition): :ref:`sphx_glr_auto_examples_compose_plot_feature_union.py`. """ - _required_parameters = ["transformer_list"] - def __init__( self, transformer_list, From 650bb8eb85d65c883cd643d9010e1d4490df5fea Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 5 Sep 2024 09:16:10 +0200 Subject: [PATCH 11/17] trying different params --- sklearn/base.py | 19 +++++++++++++++++++ sklearn/tests/test_common.py | 6 ++---- .../utils/_test_common/instance_generator.py | 5 +++-- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index bc20b2282698f..477d27cec34b0 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -1320,6 +1320,25 @@ class MetaEstimatorMixin: .. versionchanged:: 1.6 The `_required_parameters` is now removed and is unnecessary since tests are refactored and don't use this anymore. + + Examples + -------- + >>> from sklearn.base import MetaEstimatorMixin + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> class MyEstimator(MetaEstimatorMixin): + ... def __init__(self, *, estimator=None): + ... self.estimator = estimator + ... def fit(self, X, y=None): + ... if self.estimator is None: + ... self.estimator_ = LogisticRegression() + ... else: + ... self.estimator_ = self.estimator + ... return self + >>> X, y = load_iris(return_X_y=True) + >>> estimator = MyEstimator().fit(X, y) + >>> estimator.estimator_ + LogisticRegression() """ diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index ebaa7a6037979..6f56bbbce84cb 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -29,15 +29,13 @@ from sklearn.compose import ColumnTransformer from sklearn.datasets import make_blobs from sklearn.exceptions import ConvergenceWarning, FitFailedWarning + +# make it possible to discover experimental estimators when calling `all_estimators` from sklearn.experimental import ( enable_halving_search_cv, # noqa enable_iterative_imputer, # noqa ) - -# make it possible to discover experimental estimators when calling `all_estimators` from sklearn.linear_model import LogisticRegression - -# make it possible to discover experimental estimators when calling `all_estimators` from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding from sklearn.neighbors import ( KNeighborsClassifier, diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 218969dfa5310..8ce1e5e65c05c 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -337,11 +337,12 @@ def _generate_pipeline(): error_score="raise", ), HalvingRandomSearchCV: dict( - estimator=LogisticRegression(), - param_distributions={"C": [0.1, 1.0]}, + estimator=Ridge(), + param_distributions={"alpha": [0.1, 1.0]}, min_resources="smallest", cv=2, error_score="raise", + random_state=0, ), MultiOutputClassifier: dict(estimator=LogisticRegression(C=1)), MultiOutputRegressor: dict(estimator=Ridge()), From 148e5d54156e2c587dedcac1b365e9b4d9fb9c56 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 5 Sep 2024 09:49:28 +0200 Subject: [PATCH 12/17] merge the two dicts --- sklearn/tests/test_common.py | 11 - .../utils/_test_common/instance_generator.py | 368 +++++++++--------- sklearn/utils/tests/test_estimator_checks.py | 3 - 3 files changed, 175 insertions(+), 207 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 6f56bbbce84cb..ae86c602cef7e 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -58,7 +58,6 @@ _generate_pipeline, _generate_search_cv_instances, _get_check_estimator_ids, - _set_checking_parameters, _tested_estimators, ) from sklearn.utils._testing import ( @@ -139,7 +138,6 @@ def test_estimators(estimator, check, request): with ignore_warnings( category=(FutureWarning, ConvergenceWarning, UserWarning, LinAlgWarning) ): - _set_checking_parameters(estimator) check(estimator) @@ -270,7 +268,6 @@ def test_valid_tag_types(estimator): "estimator", _tested_estimators(), ids=_get_check_estimator_ids ) def test_check_n_features_in_after_fitting(estimator): - _set_checking_parameters(estimator) check_n_features_in_after_fitting(estimator.__class__.__name__, estimator) @@ -311,7 +308,6 @@ def _estimators_that_predict_in_fit(): def test_pandas_column_name_consistency(estimator): if isinstance(estimator, ColumnTransformer): pytest.skip("ColumnTransformer is not tested here") - _set_checking_parameters(estimator) with ignore_warnings(category=(FutureWarning)): with warnings.catch_warnings(record=True) as record: check_dataframe_column_names_consistency( @@ -347,7 +343,6 @@ def _include_in_get_feature_names_out_check(transformer): "transformer", GET_FEATURES_OUT_ESTIMATORS, ids=_get_check_estimator_ids ) def test_transformers_get_feature_names_out(transformer): - _set_checking_parameters(transformer) with ignore_warnings(category=(FutureWarning)): check_transformer_get_feature_names_out( @@ -368,7 +363,6 @@ def test_transformers_get_feature_names_out(transformer): ) def test_estimators_get_feature_names_out_error(estimator): estimator_name = estimator.__class__.__name__ - _set_checking_parameters(estimator) check_get_feature_names_out_error(estimator_name, estimator) @@ -404,7 +398,6 @@ def test_check_param_validation(estimator): if isinstance(estimator, FeatureUnion): pytest.skip("FeatureUnion is not tested here") name = estimator.__class__.__name__ - _set_checking_parameters(estimator) check_param_validation(name, estimator) @@ -469,7 +462,6 @@ def test_set_output_transform(estimator): f"Skipping check_set_output_transform for {name}: Does not support" " set_output API" ) - _set_checking_parameters(estimator) with ignore_warnings(category=(FutureWarning)): check_set_output_transform(estimator.__class__.__name__, estimator) @@ -493,7 +485,6 @@ def test_set_output_transform_configured(estimator, check_func): f"Skipping {check_func.__name__} for {name}: Does not support" " set_output API yet" ) - _set_checking_parameters(estimator) with ignore_warnings(category=(FutureWarning)): check_func(estimator.__class__.__name__, estimator) @@ -511,8 +502,6 @@ def test_check_inplace_ensure_writeable(estimator): else: raise SkipTest(f"{name} doesn't require writeable input.") - _set_checking_parameters(estimator) - # The following estimators can work inplace only with certain settings if name == "HDBSCAN": estimator.set_params(metric="precomputed", algorithm="brute") diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 8ce1e5e65c05c..4f6b98917d260 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -149,150 +149,207 @@ # The following dictionary is to indicate constructor arguments suitable for the test # suite, which uses very small datasets, and is intended to run rather quickly. -TEST_PARAMS = { - AdaBoostClassifier: dict(n_estimators=5), - AdaBoostRegressor: dict(n_estimators=5), - AffinityPropagation: dict(max_iter=5), - AgglomerativeClustering: dict(n_clusters=2), - ARDRegression: dict(max_iter=5), - BaggingClassifier: dict(n_estimators=5), - BaggingRegressor: dict(n_estimators=5), - BayesianGaussianMixture: dict(n_init=2, max_iter=5), - BayesianRidge: dict(max_iter=5), - BernoulliRBM: dict(n_iter=5, batch_size=10), - Birch: dict(n_clusters=2), - BisectingKMeans: dict(n_init=2, n_clusters=2, max_iter=5), - CalibratedClassifierCV: dict(cv=3), - CCA: dict(n_components=1, max_iter=5), - ClassifierChain: dict(cv=3), - DictionaryLearning: dict(max_iter=20, transform_algorithm="lasso_lars"), +INIT_PARAMS = { + AdaBoostClassifier: {"n_estimators": 5}, + AdaBoostRegressor: {"n_estimators": 5}, + AffinityPropagation: {"max_iter": 5}, + AgglomerativeClustering: {"n_clusters": 2}, + ARDRegression: {"max_iter": 5}, + BaggingClassifier: {"n_estimators": 5}, + BaggingRegressor: {"n_estimators": 5}, + BayesianGaussianMixture: {"n_init": 2, "max_iter": 5}, + BayesianRidge: {"max_iter": 5}, + BernoulliRBM: {"n_iter": 5, "batch_size": 10}, + Birch: {"n_clusters": 2}, + BisectingKMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5}, + CalibratedClassifierCV: {"estimator": LogisticRegression(C=1), "cv": 3}, + CCA: {"n_components": 1, "max_iter": 5}, + ClassifierChain: {"base_estimator": LogisticRegression(C=1), "cv": 3}, + ColumnTransformer: {"transformers": [("trans1", StandardScaler(), [0, 1])]}, + DictionaryLearning: {"max_iter": 20, "transform_algorithm": "lasso_lars"}, # the default strategy prior would output constant predictions and fail # for check_classifiers_predictions - DummyClassifier: dict(strategy="stratified"), - ElasticNetCV: dict(max_iter=5, cv=3), - ElasticNet: dict(max_iter=5), - ExtraTreesClassifier: dict(n_estimators=5), - ExtraTreesRegressor: dict(n_estimators=5), - FactorAnalysis: dict(max_iter=5), - FastICA: dict(max_iter=5), - FeatureAgglomeration: dict(n_clusters=2), - GammaRegressor: dict(max_iter=5), - GaussianMixture: dict(n_init=2, max_iter=5), + DummyClassifier: {"strategy": "stratified"}, + ElasticNetCV: {"max_iter": 5, "cv": 3}, + ElasticNet: {"max_iter": 5}, + ExtraTreesClassifier: {"n_estimators": 5}, + ExtraTreesRegressor: {"n_estimators": 5}, + FactorAnalysis: {"max_iter": 5}, + FastICA: {"max_iter": 5}, + FeatureAgglomeration: {"n_clusters": 2}, + FeatureUnion: {"transformer_list": [("trans1", StandardScaler())]}, + FixedThresholdClassifier: {"estimator": LogisticRegression(C=1)}, + GammaRegressor: {"max_iter": 5}, + GaussianMixture: {"n_init": 2, "max_iter": 5}, # Due to the jl lemma and often very few samples, the number # of components of the random matrix projection will be probably # greater than the number of features. # So we impose a smaller number (avoid "auto" mode) - GaussianRandomProjection: dict(n_components=2), - GradientBoostingClassifier: dict(n_estimators=5), - GradientBoostingRegressor: dict(n_estimators=5), - GraphicalLassoCV: dict(max_iter=5, cv=3), - GraphicalLasso: dict(max_iter=5), - GridSearchCV: dict(cv=3), - HDBSCAN: dict(min_samples=1), + GaussianRandomProjection: {"n_components": 2}, + GradientBoostingClassifier: {"n_estimators": 5}, + GradientBoostingRegressor: {"n_estimators": 5}, + GraphicalLassoCV: {"max_iter": 5, "cv": 3}, + GraphicalLasso: {"max_iter": 5}, + GridSearchCV: { + "estimator": LogisticRegression(C=1), + "param_grid": {"C": [1.0]}, + "cv": 3, + }, + HalvingGridSearchCV: { + "estimator": Ridge(), + "min_resources": "smallest", + "param_grid": {"alpha": [0.1, 1.0]}, + "random_state": 0, + "cv": 2, + "error_score": "raise", + }, + HalvingRandomSearchCV: { + "estimator": Ridge(), + "param_distributions": {"alpha": [0.1, 1.0]}, + "min_resources": "smallest", + "cv": 2, + "error_score": "raise", + "random_state": 0, + }, + HDBSCAN: {"min_samples": 1}, # The default min_samples_leaf (20) isn't appropriate for small # datasets (only very shallow trees are built) that the checks use. - HistGradientBoostingClassifier: dict(max_iter=5, min_samples_leaf=5), - HistGradientBoostingRegressor: dict(max_iter=5, min_samples_leaf=5), - HuberRegressor: dict(max_iter=5), - IncrementalPCA: dict(batch_size=10), - IsolationForest: dict(n_estimators=5), - KMeans: dict(n_init=2, n_clusters=2, max_iter=5), - LabelPropagation: dict(max_iter=5), - LabelSpreading: dict(max_iter=5), - LarsCV: dict(max_iter=5, cv=3), - LassoCV: dict(max_iter=5, cv=3), - Lasso: dict(max_iter=5), - LassoLarsCV: dict(max_iter=5, cv=3), - LassoLars: dict(max_iter=5), + HistGradientBoostingClassifier: {"max_iter": 5, "min_samples_leaf": 5}, + HistGradientBoostingRegressor: {"max_iter": 5, "min_samples_leaf": 5}, + HuberRegressor: {"max_iter": 5}, + IncrementalPCA: {"batch_size": 10}, + IsolationForest: {"n_estimators": 5}, + KMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5}, + LabelPropagation: {"max_iter": 5}, + LabelSpreading: {"max_iter": 5}, + LarsCV: {"max_iter": 5, "cv": 3}, + LassoCV: {"max_iter": 5, "cv": 3}, + LassoLarsCV: {"max_iter": 5, "cv": 3}, # Noise variance estimation does not work when `n_samples < n_features`. # We need to provide the noise variance explicitly. - LassoLarsIC: dict(max_iter=5, noise_variance=1.0), - LatentDirichletAllocation: dict(max_iter=5, batch_size=10), - LinearSVR: dict(max_iter=20), - LinearSVC: dict(max_iter=20), - LocallyLinearEmbedding: dict(max_iter=5), - LogisticRegressionCV: dict(max_iter=5, cv=3), - LogisticRegression: dict(max_iter=5), - MDS: dict(n_init=2, max_iter=5), + LassoLarsIC: {"max_iter": 5, "noise_variance": 1.0}, + LassoLars: {"max_iter": 5}, + Lasso: {"max_iter": 5}, + LatentDirichletAllocation: {"max_iter": 5, "batch_size": 10}, + LinearSVC: {"max_iter": 20}, + LinearSVR: {"max_iter": 20}, + LocallyLinearEmbedding: {"max_iter": 5}, + LogisticRegressionCV: {"max_iter": 5, "cv": 3}, + LogisticRegression: {"max_iter": 5}, + MDS: {"n_init": 2, "max_iter": 5}, # In the case of check_fit2d_1sample, bandwidth is set to None and # is thus estimated. De facto it is 0.0 as a single sample is provided # and this makes the test fails. Hence we give it a placeholder value. - MeanShift: dict(max_iter=5, bandwidth=1.0), - MiniBatchDictionaryLearning: dict(batch_size=10, max_iter=5), - MiniBatchKMeans: dict(n_init=2, n_clusters=2, max_iter=5, batch_size=10), - MiniBatchNMF: dict(batch_size=10, max_iter=20, fresh_restarts=True), - MiniBatchSparsePCA: dict(max_iter=5, batch_size=10), - MLPClassifier: dict(max_iter=100), - MLPRegressor: dict(max_iter=100), - MultiTaskElasticNetCV: dict(max_iter=5, cv=3), - MultiTaskElasticNet: dict(max_iter=5), - MultiTaskLassoCV: dict(max_iter=5, cv=3), - MultiTaskLasso: dict(max_iter=5), - NeighborhoodComponentsAnalysis: dict(max_iter=5), - NMF: dict(max_iter=500), - NuSVC: dict(max_iter=-1), - NuSVR: dict(max_iter=-1), - OneClassSVM: dict(max_iter=-1), - OneHotEncoder: dict(handle_unknown="ignore"), - OrthogonalMatchingPursuitCV: dict(cv=3), - PassiveAggressiveClassifier: dict(max_iter=5), - PassiveAggressiveRegressor: dict(max_iter=5), - Perceptron: dict(max_iter=5), - PLSCanonical: dict(n_components=1, max_iter=5), - PLSRegression: dict(n_components=1, max_iter=5), - PLSSVD: dict(n_components=1), - PoissonRegressor: dict(max_iter=5), - RandomForestClassifier: dict(n_estimators=5), - RandomForestRegressor: dict(n_estimators=5), - RandomizedSearchCV: dict(n_iter=5, cv=3), - RandomTreesEmbedding: dict(n_estimators=5), - RANSACRegressor: dict(max_trials=10), - RegressorChain: dict(cv=3), - RFECV: dict(cv=3), + MeanShift: {"max_iter": 5, "bandwidth": 1.0}, + MiniBatchDictionaryLearning: {"batch_size": 10, "max_iter": 5}, + MiniBatchKMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5, "batch_size": 10}, + MiniBatchNMF: {"batch_size": 10, "max_iter": 20, "fresh_restarts": True}, + MiniBatchSparsePCA: {"max_iter": 5, "batch_size": 10}, + MLPClassifier: {"max_iter": 100}, + MLPRegressor: {"max_iter": 100}, + MultiOutputClassifier: {"estimator": LogisticRegression(C=1)}, + MultiOutputRegressor: {"estimator": Ridge()}, + MultiTaskElasticNetCV: {"max_iter": 5, "cv": 3}, + MultiTaskElasticNet: {"max_iter": 5}, + MultiTaskLassoCV: {"max_iter": 5, "cv": 3}, + MultiTaskLasso: {"max_iter": 5}, + NeighborhoodComponentsAnalysis: {"max_iter": 5}, + NMF: {"max_iter": 500}, + NuSVC: {"max_iter": -1}, + NuSVR: {"max_iter": -1}, + OneClassSVM: {"max_iter": -1}, + OneHotEncoder: {"handle_unknown": "ignore"}, + OneVsOneClassifier: {"estimator": LogisticRegression(C=1)}, + OneVsRestClassifier: {"estimator": LogisticRegression(C=1)}, + OrthogonalMatchingPursuitCV: {"cv": 3}, + OutputCodeClassifier: {"estimator": LogisticRegression(C=1)}, + PassiveAggressiveClassifier: {"max_iter": 5}, + PassiveAggressiveRegressor: {"max_iter": 5}, + Perceptron: {"max_iter": 5}, + Pipeline: {"steps": [("scaler", StandardScaler()), ("est", Ridge())]}, + PLSCanonical: {"n_components": 1, "max_iter": 5}, + PLSRegression: {"n_components": 1, "max_iter": 5}, + PLSSVD: {"n_components": 1}, + PoissonRegressor: {"max_iter": 5}, + RandomForestClassifier: {"n_estimators": 5}, + RandomForestRegressor: {"n_estimators": 5}, + RandomizedSearchCV: { + "estimator": LogisticRegression(C=1), + "param_distributions": {"C": [1.0]}, + "n_iter": 5, + "cv": 3, + }, + RandomTreesEmbedding: {"n_estimators": 5}, + # `RANSACRegressor` will raise an error with any model other + # than `LinearRegression` if we don't fix `min_samples` parameter. + # For common test, we can enforce using `LinearRegression` that + # is the default estimator in `RANSACRegressor` instead of `Ridge`. + RANSACRegressor: {"estimator": LinearRegression(), "max_trials": 10}, + RegressorChain: {"base_estimator": Ridge(), "cv": 3}, + RFECV: {"estimator": LogisticRegression(C=1), "cv": 3}, + RFE: {"estimator": LogisticRegression(C=1)}, # be tolerant of noisy datasets (not actually speed) - SelectFdr: dict(alpha=0.5), + SelectFdr: {"alpha": 0.5}, + # Increases coverage because SGDRegressor has partial_fit + SelectFromModel: {"estimator": SGDRegressor(random_state=0)}, # SelectKBest has a default of k=10 # which is more feature than we have in most case. - SelectKBest: dict(k=1), - SelfTrainingClassifier: dict(max_iter=5), - SequentialFeatureSelector: dict(cv=3), - SGDClassifier: dict(max_iter=5), - SGDOneClassSVM: dict(max_iter=5), - SGDRegressor: dict(max_iter=5), - SparsePCA: dict(max_iter=5), + SelectKBest: {"k": 1}, + SelfTrainingClassifier: {"estimator": LogisticRegression(C=1), "max_iter": 5}, + SequentialFeatureSelector: {"estimator": LogisticRegression(C=1), "cv": 3}, + SGDClassifier: {"max_iter": 5}, + SGDOneClassSVM: {"max_iter": 5}, + SGDRegressor: {"max_iter": 5}, + SparsePCA: {"max_iter": 5}, # Due to the jl lemma and often very few samples, the number # of components of the random matrix projection will be probably # greater than the number of features. # So we impose a smaller number (avoid "auto" mode) - SparseRandomProjection: dict(n_components=2), - SpectralBiclustering: dict(n_init=2, n_best=1, n_clusters=2), - SpectralClustering: dict(n_init=2, n_clusters=2), - SpectralCoclustering: dict(n_init=2, n_clusters=2), + SparseRandomProjection: {"n_components": 2}, + SpectralBiclustering: {"n_init": 2, "n_best": 1, "n_clusters": 2}, + SpectralClustering: {"n_init": 2, "n_clusters": 2}, + SpectralCoclustering: {"n_init": 2, "n_clusters": 2}, # Default "auto" parameter can lead to different ordering of eigenvalues on # windows: #24105 - SpectralEmbedding: dict(eigen_tol=1e-5), - StackingClassifier: dict(cv=3), - StackingRegressor: dict(cv=3), - SVC: dict(max_iter=-1), - SVR: dict(max_iter=-1), - TargetEncoder: dict(cv=3), - TheilSenRegressor: dict(max_iter=5, max_subpopulation=100), + SpectralEmbedding: {"eigen_tol": 1e-05}, + StackingClassifier: { + "estimators": [ + ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), + ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), + ], + "cv": 3, + }, + StackingRegressor: { + "estimators": [ + ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), + ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), + ], + "cv": 3, + }, + SVC: {"max_iter": -1}, + SVR: {"max_iter": -1}, + TargetEncoder: {"cv": 3}, + TheilSenRegressor: {"max_iter": 5, "max_subpopulation": 100}, # TruncatedSVD doesn't run with n_components = n_features - TruncatedSVD: dict(n_iter=5, n_components=1), - TSNE: dict(perplexity=2), - TunedThresholdClassifierCV: dict(cv=3), - TweedieRegressor: dict(max_iter=5), + TruncatedSVD: {"n_iter": 5, "n_components": 1}, + TSNE: {"perplexity": 2}, + TunedThresholdClassifierCV: {"estimator": LogisticRegression(C=1), "cv": 3}, + TweedieRegressor: {"max_iter": 5}, + VotingClassifier: { + "estimators": [ + ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), + ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), + ] + }, + VotingRegressor: { + "estimators": [ + ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), + ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), + ] + }, } -def _set_checking_parameters(estimator): - """Set the parameters of an estimator instance to speed-up tests and avoid - deprecation warnings in common test.""" - if type(estimator) in TEST_PARAMS: - test_params = TEST_PARAMS[type(estimator)] - estimator.set_params(**test_params) - - def _tested_estimators(type_filter=None): for name, Estimator in all_estimators(type_filter=type_filter): try: @@ -316,81 +373,6 @@ def _generate_pipeline(): ) -INIT_PARAMS = { - SelfTrainingClassifier: dict(estimator=LogisticRegression(C=1)), - CalibratedClassifierCV: dict(estimator=LogisticRegression(C=1)), - ClassifierChain: dict(base_estimator=LogisticRegression(C=1)), - ColumnTransformer: dict( - transformers=[ - ("trans1", StandardScaler(), [0, 1]), - ] - ), - FeatureUnion: dict(transformer_list=[("trans1", StandardScaler())]), - FixedThresholdClassifier: dict(estimator=LogisticRegression(C=1)), - GridSearchCV: dict(estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}), - HalvingGridSearchCV: dict( - estimator=Ridge(), - min_resources="smallest", - param_grid={"alpha": [0.1, 1.0]}, - random_state=0, - cv=2, - error_score="raise", - ), - HalvingRandomSearchCV: dict( - estimator=Ridge(), - param_distributions={"alpha": [0.1, 1.0]}, - min_resources="smallest", - cv=2, - error_score="raise", - random_state=0, - ), - MultiOutputClassifier: dict(estimator=LogisticRegression(C=1)), - MultiOutputRegressor: dict(estimator=Ridge()), - OneVsOneClassifier: dict(estimator=LogisticRegression(C=1)), - OneVsRestClassifier: dict(estimator=LogisticRegression(C=1)), - OutputCodeClassifier: dict(estimator=LogisticRegression(C=1)), - Pipeline: dict(steps=[("scaler", StandardScaler()), ("est", Ridge())]), - RandomizedSearchCV: dict( - estimator=LogisticRegression(C=1), param_distributions={"C": [1.0]} - ), - # `RANSACRegressor` will raise an error with any model other - # than `LinearRegression` if we don't fix `min_samples` parameter. - # For common test, we can enforce using `LinearRegression` that - # is the default estimator in `RANSACRegressor` instead of `Ridge`. - RANSACRegressor: dict(estimator=LinearRegression()), - RegressorChain: dict(base_estimator=Ridge()), - RFECV: dict(estimator=LogisticRegression(C=1)), - RFE: dict(estimator=LogisticRegression(C=1)), - # Increases coverage because SGDRegressor has partial_fit - SelectFromModel: dict(estimator=SGDRegressor(random_state=0)), - SequentialFeatureSelector: dict(estimator=LogisticRegression(C=1)), - StackingClassifier: dict( - estimators=[ - ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), - ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), - ] - ), - StackingRegressor: dict( - estimators=[ - ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), - ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), - ] - ), - TunedThresholdClassifierCV: dict(estimator=LogisticRegression(C=1)), - VotingClassifier: dict( - estimators=[ - ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), - ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), - ] - ), - VotingRegressor: dict( - estimators=[ - ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), - ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), - ] - ), -} - SKIPPED_ESTIMATORS = [SparseCoder] diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index b90c8d0133dbe..65fbd8a50f3ba 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -30,7 +30,6 @@ from sklearn.svm import SVC, NuSVC from sklearn.utils import _array_api, all_estimators, deprecated from sklearn.utils._param_validation import Interval, StrOptions -from sklearn.utils._test_common.instance_generator import _set_checking_parameters from sklearn.utils._testing import ( MinimalClassifier, MinimalRegressor, @@ -745,7 +744,6 @@ def test_check_estimator_clones(): # without fitting with ignore_warnings(category=ConvergenceWarning): est = Estimator() - _set_checking_parameters(est) set_random_state(est) old_hash = joblib.hash(est) check_estimator(est) @@ -754,7 +752,6 @@ def test_check_estimator_clones(): # with fitting with ignore_warnings(category=ConvergenceWarning): est = Estimator() - _set_checking_parameters(est) set_random_state(est) est.fit(iris.data + 10, iris.target) old_hash = joblib.hash(est) From 32a6ec67506ab6d9dd36845dcad5499b3c8034ba Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 5 Sep 2024 09:58:13 +0200 Subject: [PATCH 13/17] reduce diff --- .../utils/_test_common/instance_generator.py | 312 +++++++++--------- 1 file changed, 155 insertions(+), 157 deletions(-) diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 4f6b98917d260..01a028aaf201e 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -150,203 +150,201 @@ # The following dictionary is to indicate constructor arguments suitable for the test # suite, which uses very small datasets, and is intended to run rather quickly. INIT_PARAMS = { - AdaBoostClassifier: {"n_estimators": 5}, - AdaBoostRegressor: {"n_estimators": 5}, - AffinityPropagation: {"max_iter": 5}, - AgglomerativeClustering: {"n_clusters": 2}, - ARDRegression: {"max_iter": 5}, - BaggingClassifier: {"n_estimators": 5}, - BaggingRegressor: {"n_estimators": 5}, - BayesianGaussianMixture: {"n_init": 2, "max_iter": 5}, - BayesianRidge: {"max_iter": 5}, - BernoulliRBM: {"n_iter": 5, "batch_size": 10}, - Birch: {"n_clusters": 2}, - BisectingKMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5}, - CalibratedClassifierCV: {"estimator": LogisticRegression(C=1), "cv": 3}, - CCA: {"n_components": 1, "max_iter": 5}, - ClassifierChain: {"base_estimator": LogisticRegression(C=1), "cv": 3}, - ColumnTransformer: {"transformers": [("trans1", StandardScaler(), [0, 1])]}, - DictionaryLearning: {"max_iter": 20, "transform_algorithm": "lasso_lars"}, + AdaBoostClassifier: dict(n_estimators=5), + AdaBoostRegressor: dict(n_estimators=5), + AffinityPropagation: dict(max_iter=5), + AgglomerativeClustering: dict(n_clusters=2), + ARDRegression: dict(max_iter=5), + BaggingClassifier: dict(n_estimators=5), + BaggingRegressor: dict(n_estimators=5), + BayesianGaussianMixture: dict(n_init=2, max_iter=5), + BayesianRidge: dict(max_iter=5), + BernoulliRBM: dict(n_iter=5, batch_size=10), + Birch: dict(n_clusters=2), + BisectingKMeans: dict(n_init=2, n_clusters=2, max_iter=5), + CalibratedClassifierCV: dict(estimator=LogisticRegression(C=1), cv=3), + CCA: dict(n_components=1, max_iter=5), + ClassifierChain: dict(base_estimator=LogisticRegression(C=1), cv=3), + ColumnTransformer: dict(transformers=[("trans1", StandardScaler(), [0, 1])]), + DictionaryLearning: dict(max_iter=20, transform_algorithm="lasso_lars"), # the default strategy prior would output constant predictions and fail # for check_classifiers_predictions - DummyClassifier: {"strategy": "stratified"}, - ElasticNetCV: {"max_iter": 5, "cv": 3}, - ElasticNet: {"max_iter": 5}, - ExtraTreesClassifier: {"n_estimators": 5}, - ExtraTreesRegressor: {"n_estimators": 5}, - FactorAnalysis: {"max_iter": 5}, - FastICA: {"max_iter": 5}, - FeatureAgglomeration: {"n_clusters": 2}, - FeatureUnion: {"transformer_list": [("trans1", StandardScaler())]}, - FixedThresholdClassifier: {"estimator": LogisticRegression(C=1)}, - GammaRegressor: {"max_iter": 5}, - GaussianMixture: {"n_init": 2, "max_iter": 5}, + DummyClassifier: dict(strategy="stratified"), + ElasticNetCV: dict(max_iter=5, cv=3), + ElasticNet: dict(max_iter=5), + ExtraTreesClassifier: dict(n_estimators=5), + ExtraTreesRegressor: dict(n_estimators=5), + FactorAnalysis: dict(max_iter=5), + FastICA: dict(max_iter=5), + FeatureAgglomeration: dict(n_clusters=2), + FeatureUnion: dict(transformer_list=[("trans1", StandardScaler())]), + FixedThresholdClassifier: dict(estimator=LogisticRegression(C=1)), + GammaRegressor: dict(max_iter=5), + GaussianMixture: dict(n_init=2, max_iter=5), # Due to the jl lemma and often very few samples, the number # of components of the random matrix projection will be probably # greater than the number of features. # So we impose a smaller number (avoid "auto" mode) - GaussianRandomProjection: {"n_components": 2}, - GradientBoostingClassifier: {"n_estimators": 5}, - GradientBoostingRegressor: {"n_estimators": 5}, - GraphicalLassoCV: {"max_iter": 5, "cv": 3}, - GraphicalLasso: {"max_iter": 5}, - GridSearchCV: { - "estimator": LogisticRegression(C=1), - "param_grid": {"C": [1.0]}, - "cv": 3, - }, - HalvingGridSearchCV: { - "estimator": Ridge(), - "min_resources": "smallest", - "param_grid": {"alpha": [0.1, 1.0]}, - "random_state": 0, - "cv": 2, - "error_score": "raise", - }, - HalvingRandomSearchCV: { - "estimator": Ridge(), - "param_distributions": {"alpha": [0.1, 1.0]}, - "min_resources": "smallest", - "cv": 2, - "error_score": "raise", - "random_state": 0, - }, - HDBSCAN: {"min_samples": 1}, + GaussianRandomProjection: dict(n_components=2), + GradientBoostingClassifier: dict(n_estimators=5), + GradientBoostingRegressor: dict(n_estimators=5), + GraphicalLassoCV: dict(max_iter=5, cv=3), + GraphicalLasso: dict(max_iter=5), + GridSearchCV: dict( + estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}, cv=3 + ), + HalvingGridSearchCV: dict( + estimator=Ridge(), + min_resources="smallest", + param_grid={"alpha": [0.1, 1.0]}, + random_state=0, + cv=2, + error_score="raise", + ), + HalvingRandomSearchCV: dict( + estimator=Ridge(), + param_distributions={"alpha": [0.1, 1.0]}, + min_resources="smallest", + cv=2, + error_score="raise", + random_state=0, + ), + HDBSCAN: dict(min_samples=1), # The default min_samples_leaf (20) isn't appropriate for small # datasets (only very shallow trees are built) that the checks use. - HistGradientBoostingClassifier: {"max_iter": 5, "min_samples_leaf": 5}, - HistGradientBoostingRegressor: {"max_iter": 5, "min_samples_leaf": 5}, - HuberRegressor: {"max_iter": 5}, - IncrementalPCA: {"batch_size": 10}, - IsolationForest: {"n_estimators": 5}, - KMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5}, - LabelPropagation: {"max_iter": 5}, - LabelSpreading: {"max_iter": 5}, - LarsCV: {"max_iter": 5, "cv": 3}, - LassoCV: {"max_iter": 5, "cv": 3}, - LassoLarsCV: {"max_iter": 5, "cv": 3}, + HistGradientBoostingClassifier: dict(max_iter=5, min_samples_leaf=5), + HistGradientBoostingRegressor: dict(max_iter=5, min_samples_leaf=5), + HuberRegressor: dict(max_iter=5), + IncrementalPCA: dict(batch_size=10), + IsolationForest: dict(n_estimators=5), + KMeans: dict(n_init=2, n_clusters=2, max_iter=5), + LabelPropagation: dict(max_iter=5), + LabelSpreading: dict(max_iter=5), + LarsCV: dict(max_iter=5, cv=3), + LassoCV: dict(max_iter=5, cv=3), + Lasso: dict(max_iter=5), + LassoLarsCV: dict(max_iter=5, cv=3), + LassoLars: dict(max_iter=5), # Noise variance estimation does not work when `n_samples < n_features`. # We need to provide the noise variance explicitly. - LassoLarsIC: {"max_iter": 5, "noise_variance": 1.0}, - LassoLars: {"max_iter": 5}, - Lasso: {"max_iter": 5}, - LatentDirichletAllocation: {"max_iter": 5, "batch_size": 10}, - LinearSVC: {"max_iter": 20}, - LinearSVR: {"max_iter": 20}, - LocallyLinearEmbedding: {"max_iter": 5}, - LogisticRegressionCV: {"max_iter": 5, "cv": 3}, - LogisticRegression: {"max_iter": 5}, - MDS: {"n_init": 2, "max_iter": 5}, + LassoLarsIC: dict(max_iter=5, noise_variance=1.0), + LatentDirichletAllocation: dict(max_iter=5, batch_size=10), + LinearSVC: dict(max_iter=20), + LinearSVR: dict(max_iter=20), + LocallyLinearEmbedding: dict(max_iter=5), + LogisticRegressionCV: dict(max_iter=5, cv=3), + LogisticRegression: dict(max_iter=5), + MDS: dict(n_init=2, max_iter=5), # In the case of check_fit2d_1sample, bandwidth is set to None and # is thus estimated. De facto it is 0.0 as a single sample is provided # and this makes the test fails. Hence we give it a placeholder value. - MeanShift: {"max_iter": 5, "bandwidth": 1.0}, - MiniBatchDictionaryLearning: {"batch_size": 10, "max_iter": 5}, - MiniBatchKMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5, "batch_size": 10}, - MiniBatchNMF: {"batch_size": 10, "max_iter": 20, "fresh_restarts": True}, - MiniBatchSparsePCA: {"max_iter": 5, "batch_size": 10}, - MLPClassifier: {"max_iter": 100}, - MLPRegressor: {"max_iter": 100}, - MultiOutputClassifier: {"estimator": LogisticRegression(C=1)}, - MultiOutputRegressor: {"estimator": Ridge()}, - MultiTaskElasticNetCV: {"max_iter": 5, "cv": 3}, - MultiTaskElasticNet: {"max_iter": 5}, - MultiTaskLassoCV: {"max_iter": 5, "cv": 3}, - MultiTaskLasso: {"max_iter": 5}, - NeighborhoodComponentsAnalysis: {"max_iter": 5}, - NMF: {"max_iter": 500}, - NuSVC: {"max_iter": -1}, - NuSVR: {"max_iter": -1}, - OneClassSVM: {"max_iter": -1}, - OneHotEncoder: {"handle_unknown": "ignore"}, - OneVsOneClassifier: {"estimator": LogisticRegression(C=1)}, - OneVsRestClassifier: {"estimator": LogisticRegression(C=1)}, - OrthogonalMatchingPursuitCV: {"cv": 3}, - OutputCodeClassifier: {"estimator": LogisticRegression(C=1)}, - PassiveAggressiveClassifier: {"max_iter": 5}, - PassiveAggressiveRegressor: {"max_iter": 5}, - Perceptron: {"max_iter": 5}, - Pipeline: {"steps": [("scaler", StandardScaler()), ("est", Ridge())]}, - PLSCanonical: {"n_components": 1, "max_iter": 5}, - PLSRegression: {"n_components": 1, "max_iter": 5}, - PLSSVD: {"n_components": 1}, - PoissonRegressor: {"max_iter": 5}, - RandomForestClassifier: {"n_estimators": 5}, - RandomForestRegressor: {"n_estimators": 5}, - RandomizedSearchCV: { - "estimator": LogisticRegression(C=1), - "param_distributions": {"C": [1.0]}, - "n_iter": 5, - "cv": 3, - }, - RandomTreesEmbedding: {"n_estimators": 5}, + MeanShift: dict(max_iter=5, bandwidth=1.0), + MiniBatchDictionaryLearning: dict(batch_size=10, max_iter=5), + MiniBatchKMeans: dict(n_init=2, n_clusters=2, max_iter=5, batch_size=10), + MiniBatchNMF: dict(batch_size=10, max_iter=20, fresh_restarts=True), + MiniBatchSparsePCA: dict(max_iter=5, batch_size=10), + MLPClassifier: dict(max_iter=100), + MLPRegressor: dict(max_iter=100), + MultiOutputClassifier: dict(estimator=LogisticRegression(C=1)), + MultiOutputRegressor: dict(estimator=Ridge()), + MultiTaskElasticNetCV: dict(max_iter=5, cv=3), + MultiTaskElasticNet: dict(max_iter=5), + MultiTaskLassoCV: dict(max_iter=5, cv=3), + MultiTaskLasso: dict(max_iter=5), + NeighborhoodComponentsAnalysis: dict(max_iter=5), + NMF: dict(max_iter=500), + NuSVC: dict(max_iter=-1), + NuSVR: dict(max_iter=-1), + OneClassSVM: dict(max_iter=-1), + OneHotEncoder: dict(handle_unknown="ignore"), + OneVsOneClassifier: dict(estimator=LogisticRegression(C=1)), + OneVsRestClassifier: dict(estimator=LogisticRegression(C=1)), + OrthogonalMatchingPursuitCV: dict(cv=3), + OutputCodeClassifier: dict(estimator=LogisticRegression(C=1)), + PassiveAggressiveClassifier: dict(max_iter=5), + PassiveAggressiveRegressor: dict(max_iter=5), + Perceptron: dict(max_iter=5), + Pipeline: dict(steps=[("scaler", StandardScaler()), ("est", Ridge())]), + PLSCanonical: dict(n_components=1, max_iter=5), + PLSRegression: dict(n_components=1, max_iter=5), + PLSSVD: dict(n_components=1), + PoissonRegressor: dict(max_iter=5), + RandomForestClassifier: dict(n_estimators=5), + RandomForestRegressor: dict(n_estimators=5), + RandomizedSearchCV: dict( + estimator=LogisticRegression(C=1), + param_distributions={"C": [1.0]}, + n_iter=5, + cv=3, + ), + RandomTreesEmbedding: dict(n_estimators=5), # `RANSACRegressor` will raise an error with any model other # than `LinearRegression` if we don't fix `min_samples` parameter. # For common test, we can enforce using `LinearRegression` that # is the default estimator in `RANSACRegressor` instead of `Ridge`. - RANSACRegressor: {"estimator": LinearRegression(), "max_trials": 10}, - RegressorChain: {"base_estimator": Ridge(), "cv": 3}, - RFECV: {"estimator": LogisticRegression(C=1), "cv": 3}, - RFE: {"estimator": LogisticRegression(C=1)}, + RANSACRegressor: dict(estimator=LinearRegression(), max_trials=10), + RegressorChain: dict(base_estimator=Ridge(), cv=3), + RFECV: dict(estimator=LogisticRegression(C=1), cv=3), + RFE: dict(estimator=LogisticRegression(C=1)), # be tolerant of noisy datasets (not actually speed) - SelectFdr: {"alpha": 0.5}, + SelectFdr: dict(alpha=0.5), # Increases coverage because SGDRegressor has partial_fit - SelectFromModel: {"estimator": SGDRegressor(random_state=0)}, + SelectFromModel: dict(estimator=SGDRegressor(random_state=0)), # SelectKBest has a default of k=10 # which is more feature than we have in most case. - SelectKBest: {"k": 1}, - SelfTrainingClassifier: {"estimator": LogisticRegression(C=1), "max_iter": 5}, - SequentialFeatureSelector: {"estimator": LogisticRegression(C=1), "cv": 3}, - SGDClassifier: {"max_iter": 5}, - SGDOneClassSVM: {"max_iter": 5}, - SGDRegressor: {"max_iter": 5}, - SparsePCA: {"max_iter": 5}, + SelectKBest: dict(k=1), + SelfTrainingClassifier: dict(estimator=LogisticRegression(C=1), max_iter=5), + SequentialFeatureSelector: dict(estimator=LogisticRegression(C=1), cv=3), + SGDClassifier: dict(max_iter=5), + SGDOneClassSVM: dict(max_iter=5), + SGDRegressor: dict(max_iter=5), + SparsePCA: dict(max_iter=5), # Due to the jl lemma and often very few samples, the number # of components of the random matrix projection will be probably # greater than the number of features. # So we impose a smaller number (avoid "auto" mode) - SparseRandomProjection: {"n_components": 2}, - SpectralBiclustering: {"n_init": 2, "n_best": 1, "n_clusters": 2}, - SpectralClustering: {"n_init": 2, "n_clusters": 2}, - SpectralCoclustering: {"n_init": 2, "n_clusters": 2}, + SparseRandomProjection: dict(n_components=2), + SpectralBiclustering: dict(n_init=2, n_best=1, n_clusters=2), + SpectralClustering: dict(n_init=2, n_clusters=2), + SpectralCoclustering: dict(n_init=2, n_clusters=2), # Default "auto" parameter can lead to different ordering of eigenvalues on # windows: #24105 - SpectralEmbedding: {"eigen_tol": 1e-05}, - StackingClassifier: { - "estimators": [ + SpectralEmbedding: dict(eigen_tol=1e-05), + StackingClassifier: dict( + estimators=[ ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), ], - "cv": 3, - }, - StackingRegressor: { - "estimators": [ + cv=3, + ), + StackingRegressor: dict( + estimators=[ ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), ], - "cv": 3, - }, - SVC: {"max_iter": -1}, - SVR: {"max_iter": -1}, - TargetEncoder: {"cv": 3}, - TheilSenRegressor: {"max_iter": 5, "max_subpopulation": 100}, + cv=3, + ), + SVC: dict(max_iter=-1), + SVR: dict(max_iter=-1), + TargetEncoder: dict(cv=3), + TheilSenRegressor: dict(max_iter=5, max_subpopulation=100), # TruncatedSVD doesn't run with n_components = n_features - TruncatedSVD: {"n_iter": 5, "n_components": 1}, - TSNE: {"perplexity": 2}, - TunedThresholdClassifierCV: {"estimator": LogisticRegression(C=1), "cv": 3}, - TweedieRegressor: {"max_iter": 5}, - VotingClassifier: { - "estimators": [ + TruncatedSVD: dict(n_iter=5, n_components=1), + TSNE: dict(perplexity=2), + TunedThresholdClassifierCV: dict(estimator=LogisticRegression(C=1), cv=3), + TweedieRegressor: dict(max_iter=5), + VotingClassifier: dict( + estimators=[ ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)), ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)), ] - }, - VotingRegressor: { - "estimators": [ + ), + VotingRegressor: dict( + estimators=[ ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)), ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)), ] - }, + ), } From 0c7366c3e071fe732e887975ab3561fc706e875b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 5 Sep 2024 10:13:51 +0200 Subject: [PATCH 14/17] use new tags --- sklearn/compose/_column_transformer.py | 26 +++++++++++++------------- sklearn/pipeline.py | 14 +++++++------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index fa97d6ff4edbd..3099e25e64bed 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -1318,20 +1318,20 @@ def get_metadata_routing(self): return router - def _more_tags(self): - return { - "_xfail_checks": { - "check_estimators_empty_data_messages": "FIXME", - "check_estimators_nan_inf": "FIXME", - "check_estimator_sparse_array": "FIXME", - "check_estimator_sparse_matrix": "FIXME", - "check_transformer_data_not_an_array": "FIXME", - "check_fit1d": "FIXME", - "check_fit2d_predict1d": "FIXME", - "check_complex_data": "FIXME", - "check_fit2d_1feature": "FIXME", - } + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_estimators_empty_data_messages": "FIXME", + "check_estimators_nan_inf": "FIXME", + "check_estimator_sparse_array": "FIXME", + "check_estimator_sparse_matrix": "FIXME", + "check_transformer_data_not_an_array": "FIXME", + "check_fit1d": "FIXME", + "check_fit2d_predict1d": "FIXME", + "check_complex_data": "FIXME", + "check_fit2d_1feature": "FIXME", } + return tags def _check_X(X): diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index b09ae8f01381c..41daced76c1a9 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -1878,14 +1878,14 @@ def get_metadata_routing(self): return router - def _more_tags(self): - return { - "_xfail_checks": { - "check_estimators_overwrite_params": "FIXME", - "check_estimators_nan_inf": "FIXME", - "check_dont_overwrite_parameters": "FIXME", - } + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_estimators_overwrite_params": "FIXME", + "check_estimators_nan_inf": "FIXME", + "check_dont_overwrite_parameters": "FIXME", } + return tags def make_union(*transformers, n_jobs=None, verbose=False): From 27d315d5ff1b91fb4739dbb8f9671432a1f5c4de Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 5 Sep 2024 13:27:02 +0200 Subject: [PATCH 15/17] test error messages of tests --- sklearn/utils/tests/test_estimator_checks.py | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index a5c2a73aa5061..7a851bb4e7a6c 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -51,6 +51,8 @@ check_dataframe_column_names_consistency, check_decision_proba_consistency, check_estimator, + check_estimator_cloneable, + check_estimator_repr, check_estimators_unfitted, check_fit_check_is_fitted, check_fit_score_takes_y, @@ -1294,3 +1296,29 @@ def get_check_name(check): non_legacy_check_names = {get_check_name(check) for check in non_legacy_checks} legacy_check_names = {get_check_name(check) for check in legacy_checks} assert non_legacy_check_names.issubset(legacy_check_names) + + +def test_check_estimator_cloneable_error(): + """Check that the right error is raised when the estimator is not cloneable.""" + + class NotCloneable(BaseEstimator): + def __sklearn_clone__(self): + raise NotImplementedError("This estimator is not cloneable.") + + estimator = NotCloneable() + msg = "Cloning of .* failed with error" + with raises(AssertionError, match=msg): + check_estimator_cloneable("NotCloneable", estimator) + + +def test_estimator_repr_error(): + """Check that the right error is raised when the estimator does not do a repr.""" + + class NotRepr(BaseEstimator): + def __repr__(self): + raise NotImplementedError("This estimator does not have a repr.") + + estimator = NotRepr() + msg = "Repr of .* failed with error" + with raises(AssertionError, match=msg): + check_estimator_repr("NotRepr", estimator) From 912b14a43c5cc971da1ac7b0d3513ec274ac1a78 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 6 Sep 2024 11:14:58 +0500 Subject: [PATCH 16/17] Update sklearn/utils/_test_common/instance_generator.py --- sklearn/utils/_test_common/instance_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 01a028aaf201e..aff5d58a8f3a7 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -279,8 +279,8 @@ ), RandomTreesEmbedding: dict(n_estimators=5), # `RANSACRegressor` will raise an error with any model other - # than `LinearRegression` if we don't fix `min_samples` parameter. - # For common test, we can enforce using `LinearRegression` that + # than `LinearRegression` if we don't fix the `min_samples` parameter. + # For common tests, we can enforce using `LinearRegression` that # is the default estimator in `RANSACRegressor` instead of `Ridge`. RANSACRegressor: dict(estimator=LinearRegression(), max_trials=10), RegressorChain: dict(base_estimator=Ridge(), cv=3), From fd0332906f983264cc56a953c66d5b900572e877 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 6 Sep 2024 11:15:13 +0500 Subject: [PATCH 17/17] Update sklearn/utils/tests/test_estimator_checks.py --- sklearn/utils/tests/test_estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 7a851bb4e7a6c..984a6f3ff5b63 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -1312,7 +1312,7 @@ def __sklearn_clone__(self): def test_estimator_repr_error(): - """Check that the right error is raised when the estimator does not do a repr.""" + """Check that the right error is raised when the estimator does not have a repr.""" class NotRepr(BaseEstimator): def __repr__(self):