diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py
index e86ca4e7268fe..08cd63b58cbaa 100644
--- a/sklearn/cluster/_bicluster.py
+++ b/sklearn/cluster/_bicluster.py
@@ -362,6 +362,17 @@ def _fit(self, X):
             [self.column_labels_ == c for c in range(self.n_clusters)]
         )
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags._xfail_checks.update(
+            {
+                # this estimator raises ValueError: Found array with 0 feature(s)
+                # (shape=(23, 0)) while a minimum of 1 is required.
+                "check_dict_unchanged": "FIXME",
+            }
+        )
+        return tags
+
 
 class SpectralBiclustering(BaseSpectral):
     """Spectral biclustering (Kluger, 2003).
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index 0f0210e58d5f4..c2c6be238d52d 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -7,7 +7,7 @@
 from functools import partial
 from inspect import isfunction
 
-from sklearn import config_context
+from sklearn import clone, config_context
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.cluster import (
     HDBSCAN,
@@ -33,6 +33,7 @@
     FactorAnalysis,
     FastICA,
     IncrementalPCA,
+    KernelPCA,
     LatentDirichletAllocation,
     MiniBatchDictionaryLearning,
     MiniBatchNMF,
@@ -41,6 +42,7 @@
     SparsePCA,
     TruncatedSVD,
 )
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 from sklearn.dummy import DummyClassifier
 from sklearn.ensemble import (
     AdaBoostClassifier,
@@ -72,6 +74,12 @@
     SelectKBest,
     SequentialFeatureSelector,
 )
+from sklearn.kernel_approximation import (
+    Nystroem,
+    PolynomialCountSketch,
+    RBFSampler,
+    SkewedChi2Sampler,
+)
 from sklearn.linear_model import (
     ARDRegression,
     BayesianRidge,
@@ -105,7 +113,13 @@
     TheilSenRegressor,
     TweedieRegressor,
 )
-from sklearn.manifold import MDS, TSNE, LocallyLinearEmbedding, SpectralEmbedding
+from sklearn.manifold import (
+    MDS,
+    TSNE,
+    Isomap,
+    LocallyLinearEmbedding,
+    SpectralEmbedding,
+)
 from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
 from sklearn.model_selection import (
     FixedThresholdClassifier,
@@ -457,6 +471,79 @@
     ),
 }
 
+# This dictionary stores parameters for specific checks. It also enables running the
+# same check with multiple instances of the same estimator with different parameters.
+# The special key "*" registers the parameters to use for all the other checks.
+# TODO(devtools): allow third-party developers to pass test specific params to checks
+PER_ESTIMATOR_CHECK_PARAMS: dict = {
+    # TODO(devtools): check that function names here exist in checks for the estimator
+    # TODO(devtools): write a test for the same thing with tags._xfail_checks
+    AgglomerativeClustering: {"check_dict_unchanged": dict(n_clusters=1)},
+    BayesianGaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)},
+    BernoulliRBM: {"check_dict_unchanged": dict(n_components=1, n_iter=5)},
+    Birch: {"check_dict_unchanged": dict(n_clusters=1)},
+    BisectingKMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)},
+    CCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    DictionaryLearning: {
+        "check_dict_unchanged": dict(
+            max_iter=20, n_components=1, transform_algorithm="lasso_lars"
+        )
+    },
+    FactorAnalysis: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    FastICA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    FeatureAgglomeration: {"check_dict_unchanged": dict(n_clusters=1)},
+    GaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)},
+    GaussianRandomProjection: {"check_dict_unchanged": dict(n_components=1)},
+    IncrementalPCA: {"check_dict_unchanged": dict(batch_size=10, n_components=1)},
+    Isomap: {"check_dict_unchanged": dict(n_components=1)},
+    KMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)},
+    KernelPCA: {"check_dict_unchanged": dict(n_components=1)},
+    LatentDirichletAllocation: {
+        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
+    },
+    LinearDiscriminantAnalysis: {"check_dict_unchanged": dict(n_components=1)},
+    LocallyLinearEmbedding: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    MDS: {"check_dict_unchanged": dict(max_iter=5, n_components=1, n_init=2)},
+    MiniBatchDictionaryLearning: {
+        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
+    },
+    MiniBatchKMeans: {
+        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_clusters=1, n_init=2)
+    },
+    MiniBatchNMF: {
+        "check_dict_unchanged": dict(
+            batch_size=10, fresh_restarts=True, max_iter=20, n_components=1
+        )
+    },
+    MiniBatchSparsePCA: {
+        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
+    },
+    NMF: {"check_dict_unchanged": dict(max_iter=500, n_components=1)},
+    NeighborhoodComponentsAnalysis: {
+        "check_dict_unchanged": dict(max_iter=5, n_components=1)
+    },
+    Nystroem: {"check_dict_unchanged": dict(n_components=1)},
+    PCA: {"check_dict_unchanged": dict(n_components=1)},
+    PLSCanonical: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    PLSRegression: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    PLSSVD: {"check_dict_unchanged": dict(n_components=1)},
+    PolynomialCountSketch: {"check_dict_unchanged": dict(n_components=1)},
+    RBFSampler: {"check_dict_unchanged": dict(n_components=1)},
+    SkewedChi2Sampler: {"check_dict_unchanged": dict(n_components=1)},
+    SparsePCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    SparseRandomProjection: {"check_dict_unchanged": dict(n_components=1)},
+    SpectralBiclustering: {
+        "check_dict_unchanged": dict(n_best=1, n_clusters=1, n_components=1, n_init=2)
+    },
+    SpectralClustering: {
+        "check_dict_unchanged": dict(n_clusters=1, n_components=1, n_init=2)
+    },
+    SpectralCoclustering: {"check_dict_unchanged": dict(n_clusters=1, n_init=2)},
+    SpectralEmbedding: {"check_dict_unchanged": dict(eigen_tol=1e-05, n_components=1)},
+    TSNE: {"check_dict_unchanged": dict(n_components=1, perplexity=2)},
+    TruncatedSVD: {"check_dict_unchanged": dict(n_components=1)},
+}
+
 
 def _tested_estimators(type_filter=None):
     for name, Estimator in all_estimators(type_filter=type_filter):
@@ -527,3 +614,45 @@ def _get_check_estimator_ids(obj):
     if hasattr(obj, "get_params"):
         with config_context(print_changed_only=True):
             return re.sub(r"\s", "", str(obj))
+
+
+def _yield_instances_for_check(check, estimator_orig):
+    """Yield the estimator instance(s) on which ``check`` should be run.
+
+    For most estimators, this is a no-op: ``estimator_orig`` itself is yielded.
+
+    For estimators whose exact type has an entry in PER_ESTIMATOR_CHECK_PARAMS,
+    the parameters registered under the name of ``check`` are used, falling back
+    to the ones registered under the special key "*" if there are any. One clone
+    of ``estimator_orig`` is yielded per parameter set, with these parameters set.
+    """
+    # TODO(devtools): enable this behavior for third party estimators as well
+    if type(estimator_orig) not in PER_ESTIMATOR_CHECK_PARAMS:
+        yield estimator_orig
+        return
+
+    check_params = PER_ESTIMATOR_CHECK_PARAMS[type(estimator_orig)]
+
+    try:
+        check_name = check.__name__
+    except AttributeError:
+        # partial tests
+        check_name = check.func.__name__
+
+    if check_name in check_params:
+        param_set = check_params[check_name]
+    elif "*" in check_params:
+        # parameters registered for all the checks of this estimator
+        param_set = check_params["*"]
+    else:
+        yield estimator_orig
+        return
+
+    # a single dict is a shorthand for a list holding one parameter set
+    if isinstance(param_set, dict):
+        param_set = [param_set]
+
+    for params in param_set:
+        estimator = clone(estimator_orig)
+        estimator.set_params(**params)
+        yield estimator
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index cfe028cbb91bb..558df31461b4c 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -59,6 +59,7 @@
 from ._test_common.instance_generator import (
     CROSS_DECOMPOSITION,
     _get_check_estimator_ids,
+    _yield_instances_for_check,
 )
 from ._testing import (
     SkipTest,
@@ -81,7 +82,6 @@
 
 
 def _yield_api_checks(estimator):
-    yield check_estimator_cloneable
     yield check_estimator_repr
     yield check_no_attributes_set_in_init
     yield check_fit_score_takes_y
@@ -509,10 +509,14 @@ def parametrize_with_checks(estimators, *, legacy=True):
 
     def checks_generator():
         for estimator in estimators:
+            # First check that the estimator is cloneable which is needed for the rest
+            # of the checks to run
             name = type(estimator).__name__
+            yield estimator, partial(check_estimator_cloneable, name)
             for check in _yield_all_checks(estimator, legacy=legacy):
-                check = partial(check, name)
-                yield _maybe_mark_xfail(estimator, check, pytest)
+                check_with_name = partial(check, name)
+                for check_instance in _yield_instances_for_check(check, estimator):
+                    yield _maybe_mark_xfail(check_instance, check_with_name, pytest)
 
     return pytest.mark.parametrize(
         "estimator, check", checks_generator(), ids=_get_check_estimator_ids
@@ -597,9 +601,13 @@ def check_estimator(estimator=None, generate_only=False, *, legacy=True):
     name = type(estimator).__name__
 
     def checks_generator():
+        # we first need to check if the estimator is cloneable for the rest of the tests
+        # to run
+        yield estimator, partial(check_estimator_cloneable, name)
         for check in _yield_all_checks(estimator, legacy=legacy):
             check = _maybe_skip(estimator, check)
-            yield estimator, partial(check, name)
+            for check_instance in _yield_instances_for_check(check, estimator):
+                yield check_instance, partial(check, name)
 
     if generate_only:
         return checks_generator()
@@ -1257,32 +1265,13 @@ def check_complex_data(name, estimator_orig):
 
 @ignore_warnings
 def check_dict_unchanged(name, estimator_orig):
-    # this estimator raises
-    # ValueError: Found array with 0 feature(s) (shape=(23, 0))
-    # while a minimum of 1 is required.
-    # error
-    if name in ["SpectralCoclustering"]:
-        return
     rnd = np.random.RandomState(0)
-    if name in ["RANSACRegressor"]:
-        X = 3 * rnd.uniform(size=(20, 3))
-    else:
-        X = 2 * rnd.uniform(size=(20, 3))
-
+    X = 3 * rnd.uniform(size=(20, 3))
     X = _enforce_estimator_tags_X(estimator_orig, X)
 
     y = X[:, 0].astype(int)
     estimator = clone(estimator_orig)
     y = _enforce_estimator_tags_y(estimator, y)
-    if hasattr(estimator, "n_components"):
-        estimator.n_components = 1
-
-    if hasattr(estimator, "n_clusters"):
-        estimator.n_clusters = 1
-
-    if hasattr(estimator, "n_best"):
-        estimator.n_best = 1
-
     set_random_state(estimator, 1)
 
     estimator.fit(X, y)