Skip to content

TST allow setting per test settings for estimators #29820

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 12, 2024
11 changes: 11 additions & 0 deletions sklearn/cluster/_bicluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,17 @@ def _fit(self, X):
[self.column_labels_ == c for c in range(self.n_clusters)]
)

def __sklearn_tags__(self):
    """Return estimator tags, marking common checks known to fail."""
    tags = super().__sklearn_tags__()
    # check_dict_unchanged currently fails for this estimator with:
    #   ValueError: Found array with 0 feature(s) (shape=(23, 0))
    #   while a minimum of 1 is required.
    tags._xfail_checks["check_dict_unchanged"] = "FIXME"
    return tags


class SpectralBiclustering(BaseSpectral):
"""Spectral biclustering (Kluger, 2003).
Expand Down
126 changes: 124 additions & 2 deletions sklearn/utils/_test_common/instance_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from functools import partial
from inspect import isfunction

from sklearn import config_context
from sklearn import clone, config_context
from sklearn.calibration import CalibratedClassifierCV
from sklearn.cluster import (
HDBSCAN,
Expand All @@ -33,6 +33,7 @@
FactorAnalysis,
FastICA,
IncrementalPCA,
KernelPCA,
LatentDirichletAllocation,
MiniBatchDictionaryLearning,
MiniBatchNMF,
Expand All @@ -41,6 +42,7 @@
SparsePCA,
TruncatedSVD,
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
Expand Down Expand Up @@ -72,6 +74,12 @@
SelectKBest,
SequentialFeatureSelector,
)
from sklearn.kernel_approximation import (
Nystroem,
PolynomialCountSketch,
RBFSampler,
SkewedChi2Sampler,
)
from sklearn.linear_model import (
ARDRegression,
BayesianRidge,
Expand Down Expand Up @@ -105,7 +113,13 @@
TheilSenRegressor,
TweedieRegressor,
)
from sklearn.manifold import MDS, TSNE, LocallyLinearEmbedding, SpectralEmbedding
from sklearn.manifold import (
MDS,
TSNE,
Isomap,
LocallyLinearEmbedding,
SpectralEmbedding,
)
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.model_selection import (
FixedThresholdClassifier,
Expand Down Expand Up @@ -457,6 +471,79 @@
),
}

# This dictionary stores parameters for specific checks. It also enables running the
# same check with multiple instances of the same estimator with different parameters.
# The special key "*" allows applying the parameters to all checks.
# TODO(devtools): allow third-party developers to pass test specific params to checks
# NOTE: the parameter values below (n_components=1, small max_iter, n_clusters=1,
# etc.) are chosen to keep the checks fast and the fitted models minimal, not to
# produce meaningful models.
PER_ESTIMATOR_CHECK_PARAMS: dict = {
    # TODO(devtools): check that function names here exist in checks for the estimator
    # TODO(devtools): write a test for the same thing with tags._xfail_checks
    AgglomerativeClustering: {"check_dict_unchanged": dict(n_clusters=1)},
    BayesianGaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)},
    BernoulliRBM: {"check_dict_unchanged": dict(n_components=1, n_iter=5)},
    Birch: {"check_dict_unchanged": dict(n_clusters=1)},
    BisectingKMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)},
    CCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    DictionaryLearning: {
        "check_dict_unchanged": dict(
            max_iter=20, n_components=1, transform_algorithm="lasso_lars"
        )
    },
    FactorAnalysis: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    FastICA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    FeatureAgglomeration: {"check_dict_unchanged": dict(n_clusters=1)},
    GaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)},
    GaussianRandomProjection: {"check_dict_unchanged": dict(n_components=1)},
    IncrementalPCA: {"check_dict_unchanged": dict(batch_size=10, n_components=1)},
    Isomap: {"check_dict_unchanged": dict(n_components=1)},
    KMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)},
    KernelPCA: {"check_dict_unchanged": dict(n_components=1)},
    LatentDirichletAllocation: {
        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
    },
    LinearDiscriminantAnalysis: {"check_dict_unchanged": dict(n_components=1)},
    LocallyLinearEmbedding: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    MDS: {"check_dict_unchanged": dict(max_iter=5, n_components=1, n_init=2)},
    MiniBatchDictionaryLearning: {
        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
    },
    MiniBatchKMeans: {
        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_clusters=1, n_init=2)
    },
    MiniBatchNMF: {
        "check_dict_unchanged": dict(
            batch_size=10, fresh_restarts=True, max_iter=20, n_components=1
        )
    },
    MiniBatchSparsePCA: {
        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
    },
    NMF: {"check_dict_unchanged": dict(max_iter=500, n_components=1)},
    NeighborhoodComponentsAnalysis: {
        "check_dict_unchanged": dict(max_iter=5, n_components=1)
    },
    Nystroem: {"check_dict_unchanged": dict(n_components=1)},
    PCA: {"check_dict_unchanged": dict(n_components=1)},
    PLSCanonical: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    PLSRegression: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    PLSSVD: {"check_dict_unchanged": dict(n_components=1)},
    PolynomialCountSketch: {"check_dict_unchanged": dict(n_components=1)},
    RBFSampler: {"check_dict_unchanged": dict(n_components=1)},
    SkewedChi2Sampler: {"check_dict_unchanged": dict(n_components=1)},
    SparsePCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    SparseRandomProjection: {"check_dict_unchanged": dict(n_components=1)},
    SpectralBiclustering: {
        "check_dict_unchanged": dict(n_best=1, n_clusters=1, n_components=1, n_init=2)
    },
    SpectralClustering: {
        "check_dict_unchanged": dict(n_clusters=1, n_components=1, n_init=2)
    },
    SpectralCoclustering: {"check_dict_unchanged": dict(n_clusters=1, n_init=2)},
    SpectralEmbedding: {"check_dict_unchanged": dict(eigen_tol=1e-05, n_components=1)},
    TSNE: {"check_dict_unchanged": dict(n_components=1, perplexity=2)},
    TruncatedSVD: {"check_dict_unchanged": dict(n_components=1)},
}


def _tested_estimators(type_filter=None):
for name, Estimator in all_estimators(type_filter=type_filter):
Expand Down Expand Up @@ -527,3 +614,38 @@ def _get_check_estimator_ids(obj):
if hasattr(obj, "get_params"):
with config_context(print_changed_only=True):
return re.sub(r"\s", "", str(obj))


def _yield_instances_for_check(check, estimator_orig):
    """Yield estimator instances on which to run ``check``.

    For most estimators, this is a no-op and yields ``estimator_orig``
    unchanged.

    For estimators which have an entry in ``PER_ESTIMATOR_CHECK_PARAMS``, this
    yields one freshly cloned estimator per parameter set registered for the
    check. The special key ``"*"`` registers parameters applied to every check
    that has no check-specific entry (this fixes the function ignoring the
    ``"*"`` key documented on ``PER_ESTIMATOR_CHECK_PARAMS``).

    Parameters
    ----------
    check : callable
        The check to be run; either a plain function or a
        ``functools.partial`` wrapping one.
    estimator_orig : estimator instance
        The estimator instance provided by the test framework. Never mutated;
        configured copies are produced with ``clone``.

    Yields
    ------
    estimator : estimator instance
        ``estimator_orig`` itself, or clones configured with each registered
        parameter set.
    """
    # TODO(devtools): enable this behavior for third party estimators as well
    check_params = PER_ESTIMATOR_CHECK_PARAMS.get(type(estimator_orig))
    if check_params is None:
        yield estimator_orig
        return

    try:
        check_name = check.__name__
    except AttributeError:
        # partial tests: functools.partial objects carry the name on .func
        check_name = check.func.__name__

    # Fall back to the special "*" key, which applies to all checks lacking a
    # check-specific entry.
    param_set = check_params.get(check_name, check_params.get("*"))
    if param_set is None:
        yield estimator_orig
        return

    # A bare dict registers a single parameter set; a list registers several.
    if isinstance(param_set, dict):
        param_set = [param_set]

    for params in param_set:
        estimator = clone(estimator_orig)
        estimator.set_params(**params)
        yield estimator
37 changes: 13 additions & 24 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
from ._test_common.instance_generator import (
CROSS_DECOMPOSITION,
_get_check_estimator_ids,
_yield_instances_for_check,
)
from ._testing import (
SkipTest,
Expand All @@ -81,7 +82,6 @@


def _yield_api_checks(estimator):
yield check_estimator_cloneable
yield check_estimator_repr
yield check_no_attributes_set_in_init
yield check_fit_score_takes_y
Expand Down Expand Up @@ -509,10 +509,14 @@ def parametrize_with_checks(estimators, *, legacy=True):

def checks_generator():
for estimator in estimators:
# First check that the estimator is cloneable which is needed for the rest
# of the checks to run
name = type(estimator).__name__
yield estimator, partial(check_estimator_cloneable, name)
for check in _yield_all_checks(estimator, legacy=legacy):
check = partial(check, name)
yield _maybe_mark_xfail(estimator, check, pytest)
check_with_name = partial(check, name)
for check_instance in _yield_instances_for_check(check, estimator):
yield _maybe_mark_xfail(check_instance, check_with_name, pytest)

return pytest.mark.parametrize(
"estimator, check", checks_generator(), ids=_get_check_estimator_ids
Expand Down Expand Up @@ -597,9 +601,13 @@ def check_estimator(estimator=None, generate_only=False, *, legacy=True):
name = type(estimator).__name__

def checks_generator():
# we first need to check if the estimator is cloneable for the rest of the tests
# to run
yield estimator, partial(check_estimator_cloneable, name)
for check in _yield_all_checks(estimator, legacy=legacy):
check = _maybe_skip(estimator, check)
yield estimator, partial(check, name)
for check_instance in _yield_instances_for_check(check, estimator):
yield check_instance, partial(check, name)

if generate_only:
return checks_generator()
Expand Down Expand Up @@ -1257,32 +1265,13 @@ def check_complex_data(name, estimator_orig):

@ignore_warnings
def check_dict_unchanged(name, estimator_orig):
# this estimator raises
# ValueError: Found array with 0 feature(s) (shape=(23, 0))
# while a minimum of 1 is required.
# error
if name in ["SpectralCoclustering"]:
return
rnd = np.random.RandomState(0)
if name in ["RANSACRegressor"]:
X = 3 * rnd.uniform(size=(20, 3))
else:
X = 2 * rnd.uniform(size=(20, 3))

X = 3 * rnd.uniform(size=(20, 3))
X = _enforce_estimator_tags_X(estimator_orig, X)

y = X[:, 0].astype(int)
estimator = clone(estimator_orig)
y = _enforce_estimator_tags_y(estimator, y)
if hasattr(estimator, "n_components"):
estimator.n_components = 1

if hasattr(estimator, "n_clusters"):
estimator.n_clusters = 1

if hasattr(estimator, "n_best"):
estimator.n_best = 1

set_random_state(estimator, 1)

estimator.fit(X, y)
Expand Down