From 3338c71576d4fb58b8e38c1f85da0b68623888ed Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 9 Sep 2024 23:19:37 +0200
Subject: [PATCH 1/6] TST allow testing on multiple instances of estimators

---
 doc/sphinxext/allow_nan_estimators.py         |   7 +-
 sklearn/tests/test_common.py                  |  31 +-
 sklearn/tests/test_docstring_parameters.py    |   6 +-
 .../utils/_test_common/instance_generator.py  | 271 ++++++++++++------
 sklearn/utils/estimator_checks.py             |  22 +-
 5 files changed, 209 insertions(+), 128 deletions(-)

diff --git a/doc/sphinxext/allow_nan_estimators.py b/doc/sphinxext/allow_nan_estimators.py
index a1c77d81d1460..d2eb0e940b6a1 100755
--- a/doc/sphinxext/allow_nan_estimators.py
+++ b/doc/sphinxext/allow_nan_estimators.py
@@ -4,7 +4,7 @@
 from docutils.parsers.rst import Directive
 
 from sklearn.utils import all_estimators
-from sklearn.utils._test_common.instance_generator import _construct_instance
+from sklearn.utils._test_common.instance_generator import _construct_instances
 from sklearn.utils._testing import SkipTest
 
 
@@ -19,7 +19,10 @@ def make_paragraph_for_estimator_type(estimator_type):
         lst = nodes.bullet_list()
         for name, est_class in all_estimators(type_filter=estimator_type):
             with suppress(SkipTest):
-                est = _construct_instance(est_class)
+                # Here we generate the text only for one instance. This directive
+                # should not be used for meta-estimators where tags depend on the
+                # sub-estimator.
+                est = next(_construct_instances(est_class))
 
             if est.__sklearn_tags__().input_tags.allow_nan:
                 module_name = ".".join(est_class.__module__.split(".")[:2])
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index aefc7b03e1615..e66fd96921640 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -28,7 +28,7 @@
 )
 from sklearn.compose import ColumnTransformer
 from sklearn.datasets import make_blobs
-from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
+from sklearn.exceptions import ConvergenceWarning
 
 # make it possible to discover experimental estimators when calling `all_estimators`
 from sklearn.experimental import (
@@ -55,8 +55,6 @@
 from sklearn.utils import all_estimators
 from sklearn.utils._tags import get_tags
 from sklearn.utils._test_common.instance_generator import (
-    _generate_pipeline,
-    _generate_search_cv_instances,
     _get_check_estimator_ids,
     _tested_estimators,
 )
@@ -132,7 +130,7 @@ def test_get_check_estimator_ids(val, expected):
     assert _get_check_estimator_ids(val) == expected
 
 
-@parametrize_with_checks(list(chain(_tested_estimators(), _generate_pipeline())))
+@parametrize_with_checks(list(_tested_estimators()))
 def test_estimators(estimator, check, request):
     # Common tests for estimator instances
     with ignore_warnings(
@@ -233,22 +231,6 @@ def test_class_support_removed():
         parametrize_with_checks([LogisticRegression])
 
 
-@parametrize_with_checks(list(_generate_search_cv_instances()))
-def test_search_cv(estimator, check, request):
-    # Common tests for SearchCV instances
-    # We have a separate test because those meta-estimators can accept a
-    # wide range of base estimators (classifiers, regressors, pipelines)
-    with ignore_warnings(
-        category=(
-            FutureWarning,
-            ConvergenceWarning,
-            UserWarning,
-            FitFailedWarning,
-        )
-    ):
-        check(estimator)
-
-
 @pytest.mark.parametrize(
     "estimator", _tested_estimators(), ids=_get_check_estimator_ids
 )
@@ -311,7 +293,6 @@ def _estimators_that_predict_in_fit():
     chain(
         _tested_estimators(),
         [make_pipeline(LogisticRegression(C=1))],
-        list(_generate_search_cv_instances()),
         _estimators_that_predict_in_fit(),
     )
 )
@@ -401,13 +382,7 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator):
 
 
 @pytest.mark.parametrize(
-    "estimator",
-    chain(
-        _tested_estimators(),
-        _generate_pipeline(),
-        _generate_search_cv_instances(),
-    ),
-    ids=_get_check_estimator_ids,
+    "estimator", list(_tested_estimators()), ids=_get_check_estimator_ids
 )
 def test_check_param_validation(estimator):
     if isinstance(estimator, FeatureUnion):
diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py
index 3211f918aea36..34b698e3f2073 100644
--- a/sklearn/tests/test_docstring_parameters.py
+++ b/sklearn/tests/test_docstring_parameters.py
@@ -23,7 +23,7 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import FunctionTransformer
 from sklearn.utils import all_estimators
-from sklearn.utils._test_common.instance_generator import _construct_instance
+from sklearn.utils._test_common.instance_generator import _construct_instances
 from sklearn.utils._testing import (
     _get_func_name,
     assert_docstring_consistency,
@@ -202,7 +202,9 @@ def test_fit_docstring_attributes(name, Estimator):
     elif Estimator.__name__ == "SparseCoder":
         est = _construct_sparse_coder(Estimator)
     else:
-        est = _construct_instance(Estimator)
+        # TODO(devtools): use _tested_estimators instead of all_estimators in the
+        # decorator
+        est = next(_construct_instances(Estimator))
 
     if Estimator.__name__ == "SelectKBest":
         est.set_params(k=2)
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index aff5d58a8f3a7..9ad5b85b82978 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -5,10 +5,10 @@
 import re
 import warnings
 from functools import partial
-from inspect import isfunction, signature
-from itertools import product
+from inspect import isfunction
 
 from sklearn import config_context
+from sklearn.base import clone
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.cluster import (
     HDBSCAN,
@@ -129,7 +129,7 @@
 )
 from sklearn.neighbors import NeighborhoodComponentsAnalysis
 from sklearn.neural_network import BernoulliRBM, MLPClassifier, MLPRegressor
-from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
+from sklearn.pipeline import FeatureUnion, Pipeline
 from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder
 from sklearn.random_projection import (
     GaussianRandomProjection,
@@ -143,7 +143,7 @@
 from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.utils import all_estimators
-from sklearn.utils._testing import SkipTest, set_random_state
+from sklearn.utils._testing import SkipTest
 
 CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"]
 
@@ -190,25 +190,102 @@
     GradientBoostingRegressor: dict(n_estimators=5),
     GraphicalLassoCV: dict(max_iter=5, cv=3),
     GraphicalLasso: dict(max_iter=5),
-    GridSearchCV: dict(
-        estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}, cv=3
-    ),
-    HalvingGridSearchCV: dict(
-        estimator=Ridge(),
-        min_resources="smallest",
-        param_grid={"alpha": [0.1, 1.0]},
-        random_state=0,
-        cv=2,
-        error_score="raise",
-    ),
-    HalvingRandomSearchCV: dict(
-        estimator=Ridge(),
-        param_distributions={"alpha": [0.1, 1.0]},
-        min_resources="smallest",
-        cv=2,
-        error_score="raise",
-        random_state=0,
-    ),
+    GridSearchCV: [
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Ridge(),
+            param_grid={"alpha": [0.1, 1.0]},
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=LogisticRegression(),
+            param_grid={"C": [0.1, 1.0]},
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]),
+            param_grid={"ridge__alpha": [0.1, 1.0]},
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Pipeline(
+                steps=[("pca", PCA()), ("logisticregression", LogisticRegression())]
+            ),
+            param_grid={"logisticregression__C": [0.1, 1.0]},
+        ),
+    ],
+    HalvingGridSearchCV: [
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Ridge(),
+            min_resources="smallest",
+            param_grid={"alpha": [0.1, 1.0]},
+            random_state=0,
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=LogisticRegression(),
+            min_resources="smallest",
+            param_grid={"C": [0.1, 1.0]},
+            random_state=0,
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]),
+            min_resources="smallest",
+            param_grid={"ridge__alpha": [0.1, 1.0]},
+            random_state=0,
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Pipeline(
+                steps=[("pca", PCA()), ("logisticregression", LogisticRegression())]
+            ),
+            min_resources="smallest",
+            param_grid={"logisticregression__C": [0.1, 1.0]},
+            random_state=0,
+        ),
+    ],
+    HalvingRandomSearchCV: [
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Ridge(),
+            param_distributions={"alpha": [0.1, 1.0]},
+            random_state=0,
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=LogisticRegression(),
+            param_distributions={"C": [0.1, 1.0]},
+            random_state=0,
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]),
+            param_distributions={"ridge__alpha": [0.1, 1.0]},
+            random_state=0,
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Pipeline(
+                steps=[("pca", PCA()), ("logisticregression", LogisticRegression())]
+            ),
+            param_distributions={"logisticregression__C": [0.1, 1.0]},
+            random_state=0,
+        ),
+    ],
     HDBSCAN: dict(min_samples=1),
     # The default min_samples_leaf (20) isn't appropriate for small
     # datasets (only very shallow trees are built) that the checks use.
@@ -264,19 +341,53 @@
     PassiveAggressiveClassifier: dict(max_iter=5),
     PassiveAggressiveRegressor: dict(max_iter=5),
     Perceptron: dict(max_iter=5),
-    Pipeline: dict(steps=[("scaler", StandardScaler()), ("est", Ridge())]),
+    Pipeline: [
+        {"steps": [("scaler", StandardScaler()), ("final_estimator", Ridge())]},
+        {
+            "steps": [
+                ("scaler", StandardScaler()),
+                ("final_estimator", LogisticRegression()),
+            ]
+        },
+    ],
     PLSCanonical: dict(n_components=1, max_iter=5),
     PLSRegression: dict(n_components=1, max_iter=5),
     PLSSVD: dict(n_components=1),
     PoissonRegressor: dict(max_iter=5),
     RandomForestClassifier: dict(n_estimators=5),
     RandomForestRegressor: dict(n_estimators=5),
-    RandomizedSearchCV: dict(
-        estimator=LogisticRegression(C=1),
-        param_distributions={"C": [1.0]},
-        n_iter=5,
-        cv=3,
-    ),
+    RandomizedSearchCV: [
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Ridge(),
+            param_distributions={"alpha": [0.1, 1.0]},
+            random_state=0,
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=LogisticRegression(),
+            param_distributions={"C": [0.1, 1.0]},
+            random_state=0,
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]),
+            param_distributions={"ridge__alpha": [0.1, 1.0]},
+            random_state=0,
+        ),
+        dict(
+            cv=2,
+            error_score="raise",
+            estimator=Pipeline(
+                steps=[("pca", PCA()), ("logisticregression", LogisticRegression())]
+            ),
+            param_distributions={"logisticregression__C": [0.1, 1.0]},
+            random_state=0,
+        ),
+    ],
     RandomTreesEmbedding: dict(n_estimators=5),
     # `RANSACRegressor` will raise an error with any model other
     # than `LinearRegression` if we don't fix the `min_samples` parameter.
@@ -348,33 +459,31 @@
 }
 
 
+# This dictionary stores parameters for specific checks. It also enables running the
+# same check with multiple instances of the same estimator with different parameters.
+# The special key "*" allows to apply the parameters to all checks.
+CHECK_PARAMS = {
+    Pipeline: {"*": []},
+    GridSearchCV: {"*": []},
+    HalvingGridSearchCV: {"*": []},
+    RandomizedSearchCV: {"*": []},
+    HalvingRandomSearchCV: {"*": []},
+}
+
+
 def _tested_estimators(type_filter=None):
     for name, Estimator in all_estimators(type_filter=type_filter):
         try:
-            estimator = _construct_instance(Estimator)
+            for estimator in _construct_instances(Estimator):
+                yield estimator
         except SkipTest:
             continue
 
-        yield estimator
-
-
-def _generate_pipeline():
-    """Generator of simple pipeline to check compliance of the
-    :class:`~sklearn.pipeline.Pipeline` class.
-    """
-    for final_estimator in [Ridge(), LogisticRegression()]:
-        yield Pipeline(
-            steps=[
-                ("scaler", StandardScaler()),
-                ("final_estimator", final_estimator),
-            ]
-        )
-
 
 SKIPPED_ESTIMATORS = [SparseCoder]
 
 
-def _construct_instance(Estimator):
+def _construct_instances(Estimator):
     """Construct Estimator instance if possible."""
     if Estimator in SKIPPED_ESTIMATORS:
         msg = f"Can't instantiate estimator {Estimator.__name__}"
@@ -425,48 +534,28 @@ def _get_check_estimator_ids(obj):
             return re.sub(r"\s", "", str(obj))
 
 
-def _generate_search_cv_instances():
-    """Generator of `SearchCV` instances to check their compliance with scikit-learn."""
-    for SearchCV, (Estimator, param_grid) in product(
-        [
-            GridSearchCV,
-            HalvingGridSearchCV,
-            RandomizedSearchCV,
-            HalvingGridSearchCV,
-        ],
-        [
-            (Ridge, {"alpha": [0.1, 1.0]}),
-            (LogisticRegression, {"C": [0.1, 1.0]}),
-        ],
-    ):
-        init_params = signature(SearchCV).parameters
-        extra_params = (
-            {"min_resources": "smallest"} if "min_resources" in init_params else {}
-        )
-        search_cv = SearchCV(
-            Estimator(), param_grid, cv=2, error_score="raise", **extra_params
-        )
-        set_random_state(search_cv)
-        yield search_cv
+def _yield_instances_for_check(check, estimator_orig):
+    """Yield instances for a check.
 
-    for SearchCV, (Estimator, param_grid) in product(
-        [
-            GridSearchCV,
-            HalvingGridSearchCV,
-            RandomizedSearchCV,
-            HalvingRandomSearchCV,
-        ],
-        [
-            (Ridge, {"ridge__alpha": [0.1, 1.0]}),
-            (LogisticRegression, {"logisticregression__C": [0.1, 1.0]}),
-        ],
-    ):
-        init_params = signature(SearchCV).parameters
-        extra_params = (
-            {"min_resources": "smallest"} if "min_resources" in init_params else {}
-        )
-        search_cv = SearchCV(
-            make_pipeline(PCA(), Estimator()), param_grid, cv=2, **extra_params
-        ).set_params(error_score="raise")
-        set_random_state(search_cv)
-        yield search_cv
+    For most estimators, this is a no-op.
+
+    For estimators which have an entry in CHECK_PARAMS, this will yield
+    an estimator for each parameter set in CHECK_PARAMS[estimator].
+    """
+    if type(estimator_orig) not in CHECK_PARAMS:
+        return (estimator_orig,)
+
+    check_params = CHECK_PARAMS[type(estimator_orig)]
+
+    if "*" not in check_params and check not in check_params:
+        return (estimator_orig,)
+
+    if "*" in check_params:
+        param_set = check_params["*"]
+    else:
+        param_set = check_params[check]
+
+    for params in param_set:
+        estimator = clone(estimator_orig)
+        estimator.set_params(**params)
+        yield estimator
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 6da7c8eb1c7ff..30b961228e80f 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -59,8 +59,9 @@
 from ._test_common.instance_generator import (
     CROSS_DECOMPOSITION,
     INIT_PARAMS,
-    _construct_instance,
+    _construct_instances,
     _get_check_estimator_ids,
+    _yield_instances_for_check,
 )
 from ._testing import (
     SkipTest,
@@ -508,10 +509,14 @@ def parametrize_with_checks(estimators, *, legacy=True):
 
     def checks_generator():
         for estimator in estimators:
+            # First check that the estimator is cloneable which is needed for the rest
+            # of the checks to run
             name = type(estimator).__name__
+            yield estimator, partial(check_estimator_cloneable, name)
             for check in _yield_all_checks(estimator, legacy=legacy):
-                check = partial(check, name)
-                yield _maybe_mark_xfail(estimator, check, pytest)
+                check_with_name = partial(check, name)
+                for check_instance in _yield_instances_for_check(check, estimator):
+                    yield _maybe_mark_xfail(check_instance, check_with_name, pytest)
 
     return pytest.mark.parametrize(
         "estimator, check", checks_generator(), ids=_get_check_estimator_ids
@@ -596,9 +601,13 @@ def check_estimator(estimator=None, generate_only=False, *, legacy=True):
     name = type(estimator).__name__
 
     def checks_generator():
+        # we first need to check if the estimator is cloneable for the rest of the tests
+        # to run
+        yield estimator, partial(check_estimator_cloneable, name)
         for check in _yield_all_checks(estimator, legacy=legacy):
             check = _maybe_skip(estimator, check)
-            yield estimator, partial(check, name)
+            for check_instance in _yield_instances_for_check(check, estimator):
+                yield check_instance, partial(check, name)
 
     if generate_only:
         return checks_generator()
@@ -3303,7 +3312,10 @@ def check_parameters_default_constructible(name, estimator_orig):
     Estimator = estimator_orig.__class__
 
     with ignore_warnings(category=FutureWarning):
-        estimator = _construct_instance(Estimator)
+        # TODO(devtools): this test is flawed because _construct_instances doesn't
+        # construct with default parameters for sklearn estimators, while it does
+        # for third party estimators.
+        estimator = next(_construct_instances(Estimator))
         # test that set_params returns self
         assert estimator.set_params() is estimator
 

From c1509d8f063b189d30db777597a0907fb5d5c9f7 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Tue, 10 Sep 2024 13:37:34 +0200
Subject: [PATCH 2/6] cleanup

---
 doc/sphinxext/allow_nan_estimators.py         |   7 +-
 sklearn/tests/test_common.py                  |  31 ++-
 sklearn/tests/test_docstring_parameters.py    |   6 +-
 .../utils/_test_common/instance_generator.py  | 249 +++++++-----------
 sklearn/utils/estimator_checks.py             |   4 +-
 5 files changed, 130 insertions(+), 167 deletions(-)

diff --git a/doc/sphinxext/allow_nan_estimators.py b/doc/sphinxext/allow_nan_estimators.py
index d2eb0e940b6a1..a1c77d81d1460 100755
--- a/doc/sphinxext/allow_nan_estimators.py
+++ b/doc/sphinxext/allow_nan_estimators.py
@@ -4,7 +4,7 @@
 from docutils.parsers.rst import Directive
 
 from sklearn.utils import all_estimators
-from sklearn.utils._test_common.instance_generator import _construct_instances
+from sklearn.utils._test_common.instance_generator import _construct_instance
 from sklearn.utils._testing import SkipTest
 
 
@@ -19,10 +19,7 @@ def make_paragraph_for_estimator_type(estimator_type):
         lst = nodes.bullet_list()
         for name, est_class in all_estimators(type_filter=estimator_type):
             with suppress(SkipTest):
-                # Here we generate the text only for one instance. This directive
-                # should not be used for meta-estimators where tags depend on the
-                # sub-estimator.
-                est = next(_construct_instances(est_class))
+                est = _construct_instance(est_class)
 
             if est.__sklearn_tags__().input_tags.allow_nan:
                 module_name = ".".join(est_class.__module__.split(".")[:2])
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index e66fd96921640..aefc7b03e1615 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -28,7 +28,7 @@
 )
 from sklearn.compose import ColumnTransformer
 from sklearn.datasets import make_blobs
-from sklearn.exceptions import ConvergenceWarning
+from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
 
 # make it possible to discover experimental estimators when calling `all_estimators`
 from sklearn.experimental import (
@@ -55,6 +55,8 @@
 from sklearn.utils import all_estimators
 from sklearn.utils._tags import get_tags
 from sklearn.utils._test_common.instance_generator import (
+    _generate_pipeline,
+    _generate_search_cv_instances,
     _get_check_estimator_ids,
     _tested_estimators,
 )
@@ -130,7 +132,7 @@ def test_get_check_estimator_ids(val, expected):
     assert _get_check_estimator_ids(val) == expected
 
 
-@parametrize_with_checks(list(_tested_estimators()))
+@parametrize_with_checks(list(chain(_tested_estimators(), _generate_pipeline())))
 def test_estimators(estimator, check, request):
     # Common tests for estimator instances
     with ignore_warnings(
@@ -231,6 +233,22 @@ def test_class_support_removed():
         parametrize_with_checks([LogisticRegression])
 
 
+@parametrize_with_checks(list(_generate_search_cv_instances()))
+def test_search_cv(estimator, check, request):
+    # Common tests for SearchCV instances
+    # We have a separate test because those meta-estimators can accept a
+    # wide range of base estimators (classifiers, regressors, pipelines)
+    with ignore_warnings(
+        category=(
+            FutureWarning,
+            ConvergenceWarning,
+            UserWarning,
+            FitFailedWarning,
+        )
+    ):
+        check(estimator)
+
+
 @pytest.mark.parametrize(
     "estimator", _tested_estimators(), ids=_get_check_estimator_ids
 )
@@ -293,6 +311,7 @@ def _estimators_that_predict_in_fit():
     chain(
         _tested_estimators(),
         [make_pipeline(LogisticRegression(C=1))],
+        list(_generate_search_cv_instances()),
         _estimators_that_predict_in_fit(),
     )
 )
@@ -382,7 +401,13 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator):
 
 
 @pytest.mark.parametrize(
-    "estimator", list(_tested_estimators()), ids=_get_check_estimator_ids
+    "estimator",
+    chain(
+        _tested_estimators(),
+        _generate_pipeline(),
+        _generate_search_cv_instances(),
+    ),
+    ids=_get_check_estimator_ids,
 )
 def test_check_param_validation(estimator):
     if isinstance(estimator, FeatureUnion):
diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py
index 34b698e3f2073..3211f918aea36 100644
--- a/sklearn/tests/test_docstring_parameters.py
+++ b/sklearn/tests/test_docstring_parameters.py
@@ -23,7 +23,7 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import FunctionTransformer
 from sklearn.utils import all_estimators
-from sklearn.utils._test_common.instance_generator import _construct_instances
+from sklearn.utils._test_common.instance_generator import _construct_instance
 from sklearn.utils._testing import (
     _get_func_name,
     assert_docstring_consistency,
@@ -202,9 +202,7 @@ def test_fit_docstring_attributes(name, Estimator):
     elif Estimator.__name__ == "SparseCoder":
         est = _construct_sparse_coder(Estimator)
     else:
-        # TODO(devtools): use _tested_estimators instead of all_estimators in the
-        # decorator
-        est = next(_construct_instances(Estimator))
+        est = _construct_instance(Estimator)
 
     if Estimator.__name__ == "SelectKBest":
         est.set_params(k=2)
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index 9ad5b85b82978..5235f8dd4dad2 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -5,10 +5,10 @@
 import re
 import warnings
 from functools import partial
-from inspect import isfunction
+from inspect import isfunction, signature
+from itertools import product
 
-from sklearn import config_context
-from sklearn.base import clone
+from sklearn import clone, config_context
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.cluster import (
     HDBSCAN,
@@ -129,7 +129,7 @@
 )
 from sklearn.neighbors import NeighborhoodComponentsAnalysis
 from sklearn.neural_network import BernoulliRBM, MLPClassifier, MLPRegressor
-from sklearn.pipeline import FeatureUnion, Pipeline
+from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
 from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder
 from sklearn.random_projection import (
     GaussianRandomProjection,
@@ -143,7 +143,7 @@
 from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.utils import all_estimators
-from sklearn.utils._testing import SkipTest
+from sklearn.utils._testing import SkipTest, set_random_state
 
 CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"]
 
@@ -190,102 +190,25 @@
     GradientBoostingRegressor: dict(n_estimators=5),
     GraphicalLassoCV: dict(max_iter=5, cv=3),
     GraphicalLasso: dict(max_iter=5),
-    GridSearchCV: [
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Ridge(),
-            param_grid={"alpha": [0.1, 1.0]},
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=LogisticRegression(),
-            param_grid={"C": [0.1, 1.0]},
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]),
-            param_grid={"ridge__alpha": [0.1, 1.0]},
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Pipeline(
-                steps=[("pca", PCA()), ("logisticregression", LogisticRegression())]
-            ),
-            param_grid={"logisticregression__C": [0.1, 1.0]},
-        ),
-    ],
-    HalvingGridSearchCV: [
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Ridge(),
-            min_resources="smallest",
-            param_grid={"alpha": [0.1, 1.0]},
-            random_state=0,
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=LogisticRegression(),
-            min_resources="smallest",
-            param_grid={"C": [0.1, 1.0]},
-            random_state=0,
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]),
-            min_resources="smallest",
-            param_grid={"ridge__alpha": [0.1, 1.0]},
-            random_state=0,
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Pipeline(
-                steps=[("pca", PCA()), ("logisticregression", LogisticRegression())]
-            ),
-            min_resources="smallest",
-            param_grid={"logisticregression__C": [0.1, 1.0]},
-            random_state=0,
-        ),
-    ],
-    HalvingRandomSearchCV: [
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Ridge(),
-            param_distributions={"alpha": [0.1, 1.0]},
-            random_state=0,
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=LogisticRegression(),
-            param_distributions={"C": [0.1, 1.0]},
-            random_state=0,
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]),
-            param_distributions={"ridge__alpha": [0.1, 1.0]},
-            random_state=0,
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Pipeline(
-                steps=[("pca", PCA()), ("logisticregression", LogisticRegression())]
-            ),
-            param_distributions={"logisticregression__C": [0.1, 1.0]},
-            random_state=0,
-        ),
-    ],
+    GridSearchCV: dict(
+        estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}, cv=3
+    ),
+    HalvingGridSearchCV: dict(
+        estimator=Ridge(),
+        min_resources="smallest",
+        param_grid={"alpha": [0.1, 1.0]},
+        random_state=0,
+        cv=2,
+        error_score="raise",
+    ),
+    HalvingRandomSearchCV: dict(
+        estimator=Ridge(),
+        param_distributions={"alpha": [0.1, 1.0]},
+        min_resources="smallest",
+        cv=2,
+        error_score="raise",
+        random_state=0,
+    ),
     HDBSCAN: dict(min_samples=1),
     # The default min_samples_leaf (20) isn't appropriate for small
     # datasets (only very shallow trees are built) that the checks use.
@@ -341,53 +264,19 @@
     PassiveAggressiveClassifier: dict(max_iter=5),
     PassiveAggressiveRegressor: dict(max_iter=5),
     Perceptron: dict(max_iter=5),
-    Pipeline: [
-        {"steps": [("scaler", StandardScaler()), ("final_estimator", Ridge())]},
-        {
-            "steps": [
-                ("scaler", StandardScaler()),
-                ("final_estimator", LogisticRegression()),
-            ]
-        },
-    ],
+    Pipeline: dict(steps=[("scaler", StandardScaler()), ("est", Ridge())]),
     PLSCanonical: dict(n_components=1, max_iter=5),
     PLSRegression: dict(n_components=1, max_iter=5),
     PLSSVD: dict(n_components=1),
     PoissonRegressor: dict(max_iter=5),
     RandomForestClassifier: dict(n_estimators=5),
     RandomForestRegressor: dict(n_estimators=5),
-    RandomizedSearchCV: [
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Ridge(),
-            param_distributions={"alpha": [0.1, 1.0]},
-            random_state=0,
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=LogisticRegression(),
-            param_distributions={"C": [0.1, 1.0]},
-            random_state=0,
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Pipeline(steps=[("pca", PCA()), ("ridge", Ridge())]),
-            param_distributions={"ridge__alpha": [0.1, 1.0]},
-            random_state=0,
-        ),
-        dict(
-            cv=2,
-            error_score="raise",
-            estimator=Pipeline(
-                steps=[("pca", PCA()), ("logisticregression", LogisticRegression())]
-            ),
-            param_distributions={"logisticregression__C": [0.1, 1.0]},
-            random_state=0,
-        ),
-    ],
+    RandomizedSearchCV: dict(
+        estimator=LogisticRegression(C=1),
+        param_distributions={"C": [1.0]},
+        n_iter=5,
+        cv=3,
+    ),
     RandomTreesEmbedding: dict(n_estimators=5),
     # `RANSACRegressor` will raise an error with any model other
     # than `LinearRegression` if we don't fix the `min_samples` parameter.
@@ -458,32 +347,39 @@
     ),
 }
 
-
 # This dictionary stores parameters for specific checks. It also enables running the
 # same check with multiple instances of the same estimator with different parameters.
 # The special key "*" allows to apply the parameters to all checks.
-CHECK_PARAMS = {
-    Pipeline: {"*": []},
-    GridSearchCV: {"*": []},
-    HalvingGridSearchCV: {"*": []},
-    RandomizedSearchCV: {"*": []},
-    HalvingRandomSearchCV: {"*": []},
-}
+CHECK_PARAMS: dict = {}
 
 
 def _tested_estimators(type_filter=None):
     for name, Estimator in all_estimators(type_filter=type_filter):
         try:
-            for estimator in _construct_instances(Estimator):
-                yield estimator
+            estimator = _construct_instance(Estimator)
         except SkipTest:
             continue
 
+        yield estimator
+
+
+def _generate_pipeline():
+    """Generator of simple pipeline to check compliance of the
+    :class:`~sklearn.pipeline.Pipeline` class.
+    """
+    for final_estimator in [Ridge(), LogisticRegression()]:
+        yield Pipeline(
+            steps=[
+                ("scaler", StandardScaler()),
+                ("final_estimator", final_estimator),
+            ]
+        )
+
 
 SKIPPED_ESTIMATORS = [SparseCoder]
 
 
-def _construct_instances(Estimator):
+def _construct_instance(Estimator):
     """Construct Estimator instance if possible."""
     if Estimator in SKIPPED_ESTIMATORS:
         msg = f"Can't instantiate estimator {Estimator.__name__}"
@@ -534,6 +430,53 @@ def _get_check_estimator_ids(obj):
             return re.sub(r"\s", "", str(obj))
 
 
+def _generate_search_cv_instances():
+    """Generator of `SearchCV` instances to check their compliance with scikit-learn."""
+    for SearchCV, (Estimator, param_grid) in product(
+        [
+            GridSearchCV,
+            HalvingGridSearchCV,
+            RandomizedSearchCV,
+            HalvingGridSearchCV,
+        ],
+        [
+            (Ridge, {"alpha": [0.1, 1.0]}),
+            (LogisticRegression, {"C": [0.1, 1.0]}),
+        ],
+    ):
+        init_params = signature(SearchCV).parameters
+        extra_params = (
+            {"min_resources": "smallest"} if "min_resources" in init_params else {}
+        )
+        search_cv = SearchCV(
+            Estimator(), param_grid, cv=2, error_score="raise", **extra_params
+        )
+        set_random_state(search_cv)
+        yield search_cv
+
+    for SearchCV, (Estimator, param_grid) in product(
+        [
+            GridSearchCV,
+            HalvingGridSearchCV,
+            RandomizedSearchCV,
+            HalvingRandomSearchCV,
+        ],
+        [
+            (Ridge, {"ridge__alpha": [0.1, 1.0]}),
+            (LogisticRegression, {"logisticregression__C": [0.1, 1.0]}),
+        ],
+    ):
+        init_params = signature(SearchCV).parameters
+        extra_params = (
+            {"min_resources": "smallest"} if "min_resources" in init_params else {}
+        )
+        search_cv = SearchCV(
+            make_pipeline(PCA(), Estimator()), param_grid, cv=2, **extra_params
+        ).set_params(error_score="raise")
+        set_random_state(search_cv)
+        yield search_cv
+
+
 def _yield_instances_for_check(check, estimator_orig):
     """Yield instances for a check.
 
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 30b961228e80f..3fda4b98af2e4 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -59,7 +59,7 @@
 from ._test_common.instance_generator import (
     CROSS_DECOMPOSITION,
     INIT_PARAMS,
-    _construct_instances,
+    _construct_instance,
     _get_check_estimator_ids,
     _yield_instances_for_check,
 )
@@ -3315,7 +3315,7 @@ def check_parameters_default_constructible(name, estimator_orig):
         # TODO(devtools): this test is flawed because _construct_instances doesn't
         # construct with default parameters for sklearn estimators, while it does
         # for third party estimators.
-        estimator = next(_construct_instances(Estimator))
+        estimator = _construct_instance(Estimator)
         # test that set_params returns self
         assert estimator.set_params() is estimator
 

From dffd6acd808f76745b48ab832ac899d3c59836dc Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Tue, 10 Sep 2024 15:14:29 +0200
Subject: [PATCH 3/6] cleanup and fix test

---
 sklearn/cluster/_bicluster.py                 |   9 ++
 .../utils/_test_common/instance_generator.py  | 106 ++++++++++++++++--
 sklearn/utils/estimator_checks.py             |  22 +---
 3 files changed, 107 insertions(+), 30 deletions(-)

diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py
index e86ca4e7268fe..c42f2a133fa24 100644
--- a/sklearn/cluster/_bicluster.py
+++ b/sklearn/cluster/_bicluster.py
@@ -362,6 +362,15 @@ def _fit(self, X):
             [self.column_labels_ == c for c in range(self.n_clusters)]
         )
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags._xfail_checks = {
+            # ValueError: Found array with 0 feature(s) (shape=(23, 0))
+            # while a minimum of 1 is required.
+            "check_dict_unchanged": "FIXME",
+        }
+        return tags
+
 
 class SpectralBiclustering(BaseSpectral):
     """Spectral biclustering (Kluger, 2003).
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index 5235f8dd4dad2..c410dcc933c15 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -34,6 +34,7 @@
     FactorAnalysis,
     FastICA,
     IncrementalPCA,
+    KernelPCA,
     LatentDirichletAllocation,
     MiniBatchDictionaryLearning,
     MiniBatchNMF,
@@ -42,6 +43,7 @@
     SparsePCA,
     TruncatedSVD,
 )
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 from sklearn.dummy import DummyClassifier
 from sklearn.ensemble import (
     AdaBoostClassifier,
@@ -73,6 +75,12 @@
     SelectKBest,
     SequentialFeatureSelector,
 )
+from sklearn.kernel_approximation import (
+    Nystroem,
+    PolynomialCountSketch,
+    RBFSampler,
+    SkewedChi2Sampler,
+)
 from sklearn.linear_model import (
     ARDRegression,
     BayesianRidge,
@@ -106,7 +114,13 @@
     TheilSenRegressor,
     TweedieRegressor,
 )
-from sklearn.manifold import MDS, TSNE, LocallyLinearEmbedding, SpectralEmbedding
+from sklearn.manifold import (
+    MDS,
+    TSNE,
+    Isomap,
+    LocallyLinearEmbedding,
+    SpectralEmbedding,
+)
 from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
 from sklearn.model_selection import (
     FixedThresholdClassifier,
@@ -350,7 +364,74 @@
 # This dictionary stores parameters for specific checks. It also enables running the
 # same check with multiple instances of the same estimator with different parameters.
 # The special key "*" allows to apply the parameters to all checks.
-CHECK_PARAMS: dict = {}
+CHECK_PARAMS: dict = {
+    # TODO(devtools): check that function names here exist in checks for the estimator
+    # TODO(devtools): write a test for the same thing with tags._xfail_checks
+    AgglomerativeClustering: {"check_dict_unchanged": dict(n_clusters=1)},
+    BayesianGaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)},
+    BernoulliRBM: {"check_dict_unchanged": dict(n_components=1, n_iter=5)},
+    Birch: {"check_dict_unchanged": dict(n_clusters=1)},
+    BisectingKMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)},
+    CCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    DictionaryLearning: {
+        "check_dict_unchanged": dict(
+            max_iter=20, n_components=1, transform_algorithm="lasso_lars"
+        )
+    },
+    FactorAnalysis: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    FastICA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    FeatureAgglomeration: {"check_dict_unchanged": dict(n_clusters=1)},
+    GaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)},
+    GaussianRandomProjection: {"check_dict_unchanged": dict(n_components=1)},
+    IncrementalPCA: {"check_dict_unchanged": dict(batch_size=10, n_components=1)},
+    Isomap: {"check_dict_unchanged": dict(n_components=1)},
+    KMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)},
+    KernelPCA: {"check_dict_unchanged": dict(n_components=1)},
+    LatentDirichletAllocation: {
+        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
+    },
+    LinearDiscriminantAnalysis: {"check_dict_unchanged": dict(n_components=1)},
+    LocallyLinearEmbedding: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    MDS: {"check_dict_unchanged": dict(max_iter=5, n_components=1, n_init=2)},
+    MiniBatchDictionaryLearning: {
+        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
+    },
+    MiniBatchKMeans: {
+        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_clusters=1, n_init=2)
+    },
+    MiniBatchNMF: {
+        "check_dict_unchanged": dict(
+            batch_size=10, fresh_restarts=True, max_iter=20, n_components=1
+        )
+    },
+    MiniBatchSparsePCA: {
+        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
+    },
+    NMF: {"check_dict_unchanged": dict(max_iter=500, n_components=1)},
+    NeighborhoodComponentsAnalysis: {
+        "check_dict_unchanged": dict(max_iter=5, n_components=1)
+    },
+    Nystroem: {"check_dict_unchanged": dict(n_components=1)},
+    PCA: {"check_dict_unchanged": dict(n_components=1)},
+    PLSCanonical: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    PLSRegression: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    PLSSVD: {"check_dict_unchanged": dict(n_components=1)},
+    PolynomialCountSketch: {"check_dict_unchanged": dict(n_components=1)},
+    RBFSampler: {"check_dict_unchanged": dict(n_components=1)},
+    SkewedChi2Sampler: {"check_dict_unchanged": dict(n_components=1)},
+    SparsePCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
+    SparseRandomProjection: {"check_dict_unchanged": dict(n_components=1)},
+    SpectralBiclustering: {
+        "check_dict_unchanged": dict(n_best=1, n_clusters=1, n_components=1, n_init=2)
+    },
+    SpectralClustering: {
+        "check_dict_unchanged": dict(n_clusters=1, n_components=1, n_init=2)
+    },
+    SpectralCoclustering: {"check_dict_unchanged": dict(n_clusters=1, n_init=2)},
+    SpectralEmbedding: {"check_dict_unchanged": dict(eigen_tol=1e-05, n_components=1)},
+    TSNE: {"check_dict_unchanged": dict(n_components=1, perplexity=2)},
+    TruncatedSVD: {"check_dict_unchanged": dict(n_components=1)},
+}
 
 
 def _tested_estimators(type_filter=None):
@@ -486,17 +567,24 @@ def _yield_instances_for_check(check, estimator_orig):
     an estimator for each parameter set in CHECK_PARAMS[estimator].
     """
     if type(estimator_orig) not in CHECK_PARAMS:
-        return (estimator_orig,)
+        yield estimator_orig
+        return
 
     check_params = CHECK_PARAMS[type(estimator_orig)]
 
-    if "*" not in check_params and check not in check_params:
-        return (estimator_orig,)
+    try:
+        check_name = check.__name__
+    except AttributeError:
+        # partial tests
+        check_name = check.func.__name__
 
-    if "*" in check_params:
-        param_set = check_params["*"]
-    else:
-        param_set = check_params[check]
+    if check_name not in check_params:
+        yield estimator_orig
+        return
+
+    param_set = check_params[check_name]
+    if isinstance(param_set, dict):
+        param_set = [param_set]
 
     for params in param_set:
         estimator = clone(estimator_orig)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 3fda4b98af2e4..1a25bc10e0ca1 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -84,7 +84,6 @@
 
 
 def _yield_api_checks(estimator):
-    yield check_estimator_cloneable
     yield check_estimator_repr
     yield check_no_attributes_set_in_init
     yield check_fit_score_takes_y
@@ -1243,32 +1242,13 @@ def check_complex_data(name, estimator_orig):
 
 @ignore_warnings
 def check_dict_unchanged(name, estimator_orig):
-    # this estimator raises
-    # ValueError: Found array with 0 feature(s) (shape=(23, 0))
-    # while a minimum of 1 is required.
-    # error
-    if name in ["SpectralCoclustering"]:
-        return
     rnd = np.random.RandomState(0)
-    if name in ["RANSACRegressor"]:
-        X = 3 * rnd.uniform(size=(20, 3))
-    else:
-        X = 2 * rnd.uniform(size=(20, 3))
-
+    X = 3 * rnd.uniform(size=(20, 3))
     X = _enforce_estimator_tags_X(estimator_orig, X)
 
     y = X[:, 0].astype(int)
     estimator = clone(estimator_orig)
     y = _enforce_estimator_tags_y(estimator, y)
-    if hasattr(estimator, "n_components"):
-        estimator.n_components = 1
-
-    if hasattr(estimator, "n_clusters"):
-        estimator.n_clusters = 1
-
-    if hasattr(estimator, "n_best"):
-        estimator.n_best = 1
-
     set_random_state(estimator, 1)
 
     estimator.fit(X, y)

From 17d15d64f76358b1885f4311fdad0efdb4a3a063 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Tue, 10 Sep 2024 15:16:45 +0200
Subject: [PATCH 4/6] ...

---
 sklearn/utils/_test_common/instance_generator.py | 1 +
 sklearn/utils/estimator_checks.py                | 3 ---
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index c410dcc933c15..61f35e200834f 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -566,6 +566,7 @@ def _yield_instances_for_check(check, estimator_orig):
     For estimators which have an entry in CHECK_PARAMS, this will yield
     an estimator for each parameter set in CHECK_PARAMS[estimator].
     """
+    # TODO(devtools): enable this behavior for third party estimators as well
     if type(estimator_orig) not in CHECK_PARAMS:
         yield estimator_orig
         return
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 1a25bc10e0ca1..725d1c82443b2 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3292,9 +3292,6 @@ def check_parameters_default_constructible(name, estimator_orig):
     Estimator = estimator_orig.__class__
 
     with ignore_warnings(category=FutureWarning):
-        # TODO(devtools): this test is flawed because _construct_instances doesn't
-        # construct with default parameters for sklearn estimators, while it does
-        # for third party estimators.
         estimator = _construct_instance(Estimator)
         # test that set_params returns self
         assert estimator.set_params() is estimator

From 42e1e33bae54b00a3dd4f1cfa859ca1b67c499e9 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Tue, 10 Sep 2024 15:33:55 +0200
Subject: [PATCH 5/6] fix tags

---
 sklearn/cluster/_bicluster.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py
index c42f2a133fa24..08cd63b58cbaa 100644
--- a/sklearn/cluster/_bicluster.py
+++ b/sklearn/cluster/_bicluster.py
@@ -364,11 +364,13 @@ def _fit(self, X):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
-        tags._xfail_checks = {
-            # ValueError: Found array with 0 feature(s) (shape=(23, 0))
-            # while a minimum of 1 is required.
-            "check_dict_unchanged": "FIXME",
-        }
+        tags._xfail_checks.update(
+            {
+                # ValueError: Found array with 0 feature(s) (shape=(23, 0))
+                # while a minimum of 1 is required.
+                "check_dict_unchanged": "FIXME",
+            }
+        )
         return tags
 
 

From 226a4509aa859751d08ad7d5434f01deb00cceb3 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Wed, 11 Sep 2024 15:07:35 +0200
Subject: [PATCH 6/6] rename dict to PER_ESTIMATOR_CHECK_PARAMS

---
 sklearn/utils/_test_common/instance_generator.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index 4080d47e4ab42..c2c6be238d52d 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -474,7 +474,8 @@
 # This dictionary stores parameters for specific checks. It also enables running the
 # same check with multiple instances of the same estimator with different parameters.
 # The special key "*" allows to apply the parameters to all checks.
-CHECK_PARAMS: dict = {
+# TODO(devtools): allow third-party developers to pass test specific params to checks
+PER_ESTIMATOR_CHECK_PARAMS: dict = {
     # TODO(devtools): check that function names here exist in checks for the estimator
     # TODO(devtools): write a test for the same thing with tags._xfail_checks
     AgglomerativeClustering: {"check_dict_unchanged": dict(n_clusters=1)},
@@ -620,15 +621,15 @@ def _yield_instances_for_check(check, estimator_orig):
 
     For most estimators, this is a no-op.
 
-    For estimators which have an entry in CHECK_PARAMS, this will yield
-    an estimator for each parameter set in CHECK_PARAMS[estimator].
+    For estimators which have an entry in PER_ESTIMATOR_CHECK_PARAMS, this will yield
+    an estimator for each parameter set in PER_ESTIMATOR_CHECK_PARAMS[estimator].
     """
     # TODO(devtools): enable this behavior for third party estimators as well
-    if type(estimator_orig) not in CHECK_PARAMS:
+    if type(estimator_orig) not in PER_ESTIMATOR_CHECK_PARAMS:
         yield estimator_orig
         return
 
-    check_params = CHECK_PARAMS[type(estimator_orig)]
+    check_params = PER_ESTIMATOR_CHECK_PARAMS[type(estimator_orig)]
 
     try:
         check_name = check.__name__