From d28c3cb7c3f465a668a655a36d477a31f360ae35 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Wed, 21 Aug 2024 15:50:51 +0200
Subject: [PATCH 01/17] TST allow categorisation of tests into API and legacy

---
 sklearn/utils/estimator_checks.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 42edfe0d4d3c4..01b1276edcdaa 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -90,13 +90,17 @@
 CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"]
 
 
+def _yield_api_checks(estimator):
+    yield check_no_attributes_set_in_init
+    yield check_fit_score_takes_y
+    yield check_estimators_overwrite_params
+
+
 def _yield_checks(estimator):
     name = estimator.__class__.__name__
     tags = _safe_tags(estimator)
 
-    yield check_no_attributes_set_in_init
     yield check_estimators_dtypes
-    yield check_fit_score_takes_y
     if has_fit_parameter(estimator, "sample_weight"):
         yield check_sample_weights_pandas_series
         yield check_sample_weights_not_an_array
@@ -129,7 +133,6 @@ def _yield_checks(estimator):
         # Check that pairwise estimator throws error on non-square input
         yield check_nonsquare_error
 
-    yield check_estimators_overwrite_params
     if hasattr(estimator, "sparsify"):
         yield check_sparsify_coefficients
 
@@ -323,7 +326,7 @@ def _yield_array_api_checks(estimator):
         )
 
 
-def _yield_all_checks(estimator):
+def _yield_all_checks(estimator, legacy: bool):
     name = estimator.__class__.__name__
     tags = _safe_tags(estimator)
     if "2darray" not in tags["X_types"]:
@@ -341,6 +344,12 @@ def _yield_all_checks(estimator):
         )
         return
 
+    for check in _yield_api_checks(estimator):
+        yield check
+
+    if not legacy:
+        return
+
     for check in _yield_checks(estimator):
         yield check
     if is_classifier(estimator):
@@ -513,9 +522,14 @@ def _should_be_skipped_or_marked(estimator, check):
     return False, "placeholder reason that will never be used"
 
 
-def parametrize_with_checks(estimators):
+def parametrize_with_checks(estimators, legacy=True):
     """Pytest specific decorator for parametrizing estimator checks.
 
+    Checks are categorised into the following groups:
+
+        - API checks: a set of checks to ensure API compatibility with scikit-learn
+        - legacy: a set of checks which will gradually be grouped into other categories
+
     The `id` of each check is set to be a pprint version of the estimator
     and the name of the check with its keyword arguments.
     This allows to use `pytest -k` to specify which tests to run::
@@ -533,6 +547,11 @@ def parametrize_with_checks(estimators):
 
         .. versionadded:: 0.24
 
+    legacy : bool, default=True
+        Whether to include legacy checks.
+
+        .. versionadded:: 1.6
+
     Returns
     -------
     decorator : `pytest.mark.parametrize`
@@ -566,7 +585,7 @@ def parametrize_with_checks(estimators):
     def checks_generator():
         for estimator in estimators:
             name = type(estimator).__name__
-            for check in _yield_all_checks(estimator):
+            for check in _yield_all_checks(estimator, legacy=legacy):
                 check = partial(check, name)
                 yield _maybe_mark_xfail(estimator, check, pytest)
 

From 13a8e27c422a2d383585f7b098216984e797234e Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 22 Aug 2024 08:09:41 +0200
Subject: [PATCH 02/17] TST refactor instance generation and parameter setting

---
 doc/sphinxext/allow_nan_estimators.py         |   2 +-
 sklearn/decomposition/tests/test_pca.py       |   2 +-
 sklearn/linear_model/tests/test_ridge.py      |   2 +-
 sklearn/preprocessing/tests/test_data.py      |   2 +-
 sklearn/tests/test_common.py                  | 104 +----
 sklearn/tests/test_docstring_parameters.py    |   2 +-
 sklearn/utils/_test_common/__init__.py        |   2 +
 .../utils/_test_common/instance_generator.py  | 441 ++++++++++++++++++
 sklearn/utils/estimator_checks.py             | 220 +--------
 sklearn/utils/tests/test_estimator_checks.py  |   2 +-
 10 files changed, 466 insertions(+), 313 deletions(-)
 create mode 100644 sklearn/utils/_test_common/__init__.py
 create mode 100644 sklearn/utils/_test_common/instance_generator.py

diff --git a/doc/sphinxext/allow_nan_estimators.py b/doc/sphinxext/allow_nan_estimators.py
index 89d7077bce2b5..00a6ddc0048e9 100755
--- a/doc/sphinxext/allow_nan_estimators.py
+++ b/doc/sphinxext/allow_nan_estimators.py
@@ -4,8 +4,8 @@
 from docutils.parsers.rst import Directive
 
 from sklearn.utils import all_estimators
+from sklearn.utils._test_common.instance_generator import _construct_instance
 from sklearn.utils._testing import SkipTest
-from sklearn.utils.estimator_checks import _construct_instance
 
 
 class AllowNanEstimators(Directive):
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index bd7f60061abdc..52f769bfb9001 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -17,9 +17,9 @@
     yield_namespace_device_dtype_combinations,
 )
 from sklearn.utils._array_api import device as array_device
+from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids
 from sklearn.utils._testing import _array_api_for_tests, assert_allclose
 from sklearn.utils.estimator_checks import (
-    _get_check_estimator_ids,
     check_array_api_input_and_values,
 )
 from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py
index 9db58dd499269..c727d268e0ebc 100644
--- a/sklearn/linear_model/tests/test_ridge.py
+++ b/sklearn/linear_model/tests/test_ridge.py
@@ -48,6 +48,7 @@
     yield_namespace_device_dtype_combinations,
     yield_namespaces,
 )
+from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids
 from sklearn.utils._testing import (
     assert_allclose,
     assert_almost_equal,
@@ -57,7 +58,6 @@
 )
 from sklearn.utils.estimator_checks import (
     _array_api_for_tests,
-    _get_check_estimator_ids,
     check_array_api_input_and_values,
 )
 from sklearn.utils.fixes import (
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 5d254e491b400..049b188cf66a7 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -40,6 +40,7 @@
 from sklearn.utils._array_api import (
     yield_namespace_device_dtype_combinations,
 )
+from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids
 from sklearn.utils._testing import (
     _convert_container,
     assert_allclose,
@@ -51,7 +52,6 @@
     skip_if_32bit,
 )
 from sklearn.utils.estimator_checks import (
-    _get_check_estimator_ids,
     check_array_api_input_and_values,
 )
 from sklearn.utils.fixes import (
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 3a61503530f23..467c7db9a3d21 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -11,7 +11,7 @@
 import warnings
 from functools import partial
 from inspect import isgenerator, signature
-from itertools import chain, product
+from itertools import chain
 
 import numpy as np
 import pytest
@@ -26,25 +26,13 @@
     MeanShift,
     SpectralClustering,
 )
-from sklearn.compose import ColumnTransformer
 from sklearn.datasets import make_blobs
-from sklearn.decomposition import PCA
 from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
 
 # make it possible to discover experimental estimators when calling `all_estimators`
-from sklearn.experimental import (
-    enable_halving_search_cv,  # noqa
-    enable_iterative_imputer,  # noqa
-)
-from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.linear_model import LogisticRegression
 from sklearn.linear_model._base import LinearClassifierMixin
 from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding
-from sklearn.model_selection import (
-    GridSearchCV,
-    HalvingGridSearchCV,
-    HalvingRandomSearchCV,
-    RandomizedSearchCV,
-)
 from sklearn.neighbors import (
     KNeighborsClassifier,
     KNeighborsRegressor,
@@ -52,7 +40,7 @@
     RadiusNeighborsClassifier,
     RadiusNeighborsRegressor,
 )
-from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import (
     FunctionTransformer,
     MinMaxScaler,
@@ -62,15 +50,19 @@
 from sklearn.semi_supervised import LabelPropagation, LabelSpreading
 from sklearn.utils import all_estimators
 from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags
+from sklearn.utils._test_common.instance_generator import (
+    _generate_column_transformer_instances,
+    _generate_pipeline,
+    _generate_search_cv_instances,
+    _get_check_estimator_ids,
+    _set_checking_parameters,
+    _tested_estimators,
+)
 from sklearn.utils._testing import (
     SkipTest,
     ignore_warnings,
-    set_random_state,
 )
 from sklearn.utils.estimator_checks import (
-    _construct_instance,
-    _get_check_estimator_ids,
-    _set_checking_parameters,
     check_class_weight_balanced_linear_classifier,
     check_dataframe_column_names_consistency,
     check_estimator,
@@ -139,26 +131,6 @@ def test_get_check_estimator_ids(val, expected):
     assert _get_check_estimator_ids(val) == expected
 
 
-def _tested_estimators(type_filter=None):
-    for name, Estimator in all_estimators(type_filter=type_filter):
-        try:
-            estimator = _construct_instance(Estimator)
-        except SkipTest:
-            continue
-
-        yield estimator
-
-
-def _generate_pipeline():
-    for final_estimator in [Ridge(), LogisticRegression()]:
-        yield Pipeline(
-            steps=[
-                ("scaler", StandardScaler()),
-                ("final_estimator", final_estimator),
-            ]
-        )
-
-
 @parametrize_with_checks(list(chain(_tested_estimators(), _generate_pipeline())))
 def test_estimators(estimator, check, request):
     # Common tests for estimator instances
@@ -282,60 +254,6 @@ def test_class_support_removed():
         parametrize_with_checks([LogisticRegression])
 
 
-def _generate_column_transformer_instances():
-    yield ColumnTransformer(
-        transformers=[
-            ("trans1", StandardScaler(), [0, 1]),
-        ]
-    )
-
-
-def _generate_search_cv_instances():
-    for SearchCV, (Estimator, param_grid) in product(
-        [
-            GridSearchCV,
-            HalvingGridSearchCV,
-            RandomizedSearchCV,
-            HalvingGridSearchCV,
-        ],
-        [
-            (Ridge, {"alpha": [0.1, 1.0]}),
-            (LogisticRegression, {"C": [0.1, 1.0]}),
-        ],
-    ):
-        init_params = signature(SearchCV).parameters
-        extra_params = (
-            {"min_resources": "smallest"} if "min_resources" in init_params else {}
-        )
-        search_cv = SearchCV(
-            Estimator(), param_grid, cv=2, error_score="raise", **extra_params
-        )
-        set_random_state(search_cv)
-        yield search_cv
-
-    for SearchCV, (Estimator, param_grid) in product(
-        [
-            GridSearchCV,
-            HalvingGridSearchCV,
-            RandomizedSearchCV,
-            HalvingRandomSearchCV,
-        ],
-        [
-            (Ridge, {"ridge__alpha": [0.1, 1.0]}),
-            (LogisticRegression, {"logisticregression__C": [0.1, 1.0]}),
-        ],
-    ):
-        init_params = signature(SearchCV).parameters
-        extra_params = (
-            {"min_resources": "smallest"} if "min_resources" in init_params else {}
-        )
-        search_cv = SearchCV(
-            make_pipeline(PCA(), Estimator()), param_grid, cv=2, **extra_params
-        ).set_params(error_score="raise")
-        set_random_state(search_cv)
-        yield search_cv
-
-
 @parametrize_with_checks(list(_generate_search_cv_instances()))
 def test_search_cv(estimator, check, request):
     # Common tests for SearchCV instances
diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py
index 3af463b783bc3..687b85ed00187 100644
--- a/sklearn/tests/test_docstring_parameters.py
+++ b/sklearn/tests/test_docstring_parameters.py
@@ -22,6 +22,7 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import FunctionTransformer
 from sklearn.utils import all_estimators
+from sklearn.utils._test_common.instance_generator import _construct_instance
 from sklearn.utils._testing import (
     _get_func_name,
     check_docstring_parameters,
@@ -29,7 +30,6 @@
 )
 from sklearn.utils.deprecation import _is_deprecated
 from sklearn.utils.estimator_checks import (
-    _construct_instance,
     _enforce_estimator_tags_X,
     _enforce_estimator_tags_y,
 )
diff --git a/sklearn/utils/_test_common/__init__.py b/sklearn/utils/_test_common/__init__.py
new file mode 100644
index 0000000000000..67dd18fb94b59
--- /dev/null
+++ b/sklearn/utils/_test_common/__init__.py
@@ -0,0 +1,2 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
new file mode 100644
index 0000000000000..3e215111adcda
--- /dev/null
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -0,0 +1,441 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+
+import re
+import warnings
+from functools import partial
+from inspect import isfunction, signature
+from itertools import product
+
+from sklearn import config_context
+from sklearn.base import RegressorMixin
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.cluster import (
+    HDBSCAN,
+    AffinityPropagation,
+    AgglomerativeClustering,
+    Birch,
+    BisectingKMeans,
+    FeatureAgglomeration,
+    KMeans,
+    MeanShift,
+    MiniBatchKMeans,
+    SpectralBiclustering,
+    SpectralClustering,
+    SpectralCoclustering,
+)
+from sklearn.compose import ColumnTransformer
+from sklearn.covariance import GraphicalLasso, GraphicalLassoCV
+from sklearn.cross_decomposition import CCA, PLSSVD, PLSCanonical, PLSRegression
+from sklearn.decomposition import (
+    NMF,
+    PCA,
+    DictionaryLearning,
+    FactorAnalysis,
+    FastICA,
+    IncrementalPCA,
+    LatentDirichletAllocation,
+    MiniBatchDictionaryLearning,
+    MiniBatchNMF,
+    MiniBatchSparsePCA,
+    SparsePCA,
+    TruncatedSVD,
+)
+from sklearn.dummy import DummyClassifier
+from sklearn.ensemble import (
+    AdaBoostClassifier,
+    AdaBoostRegressor,
+    BaggingClassifier,
+    BaggingRegressor,
+    ExtraTreesClassifier,
+    ExtraTreesRegressor,
+    GradientBoostingClassifier,
+    GradientBoostingRegressor,
+    HistGradientBoostingClassifier,
+    HistGradientBoostingRegressor,
+    IsolationForest,
+    RandomForestClassifier,
+    RandomForestRegressor,
+    RandomTreesEmbedding,
+    StackingClassifier,
+    StackingRegressor,
+)
+from sklearn.exceptions import SkipTestWarning
+from sklearn.experimental import enable_halving_search_cv  # noqa
+from sklearn.feature_selection import (
+    RFECV,
+    SelectFdr,
+    SelectFromModel,
+    SelectKBest,
+    SequentialFeatureSelector,
+)
+from sklearn.linear_model import (
+    ARDRegression,
+    BayesianRidge,
+    ElasticNet,
+    ElasticNetCV,
+    GammaRegressor,
+    HuberRegressor,
+    LarsCV,
+    Lasso,
+    LassoCV,
+    LassoLars,
+    LassoLarsCV,
+    LassoLarsIC,
+    LinearRegression,
+    LogisticRegression,
+    LogisticRegressionCV,
+    MultiTaskElasticNet,
+    MultiTaskElasticNetCV,
+    MultiTaskLasso,
+    MultiTaskLassoCV,
+    OrthogonalMatchingPursuitCV,
+    PassiveAggressiveClassifier,
+    PassiveAggressiveRegressor,
+    Perceptron,
+    PoissonRegressor,
+    RANSACRegressor,
+    Ridge,
+    SGDClassifier,
+    SGDOneClassSVM,
+    SGDRegressor,
+    TheilSenRegressor,
+    TweedieRegressor,
+)
+from sklearn.manifold import MDS, TSNE, LocallyLinearEmbedding, SpectralEmbedding
+from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
+from sklearn.model_selection import (
+    GridSearchCV,
+    HalvingGridSearchCV,
+    HalvingRandomSearchCV,
+    RandomizedSearchCV,
+    TunedThresholdClassifierCV,
+)
+from sklearn.multioutput import ClassifierChain, RegressorChain
+from sklearn.neighbors import NeighborhoodComponentsAnalysis
+from sklearn.neural_network import BernoulliRBM, MLPClassifier, MLPRegressor
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder
+from sklearn.random_projection import (
+    GaussianRandomProjection,
+    SparseRandomProjection,
+)
+from sklearn.semi_supervised import (
+    LabelPropagation,
+    LabelSpreading,
+    SelfTrainingClassifier,
+)
+from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+from sklearn.utils import all_estimators
+from sklearn.utils._testing import SkipTest, set_random_state
+
+CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"]
+
+# The following dictionary is to indicate constructor arguments suitable for the test
+# suite, which uses very small datasets, and is intended to run rather quickly.
+TEST_PARAMS = {
+    AdaBoostClassifier: dict(n_estimators=5),
+    AdaBoostRegressor: dict(n_estimators=5),
+    AffinityPropagation: dict(max_iter=5),
+    AgglomerativeClustering: dict(n_clusters=2),
+    ARDRegression: dict(max_iter=5),
+    BaggingClassifier: dict(n_estimators=5),
+    BaggingRegressor: dict(n_estimators=5),
+    BayesianGaussianMixture: dict(n_init=2, max_iter=5),
+    BayesianRidge: dict(max_iter=5),
+    BernoulliRBM: dict(n_iter=5, batch_size=10),
+    Birch: dict(n_clusters=2),
+    BisectingKMeans: dict(n_init=2, n_clusters=2, max_iter=5),
+    CalibratedClassifierCV: dict(cv=3),
+    CCA: dict(n_components=1, max_iter=5),
+    ClassifierChain: dict(cv=3),
+    DictionaryLearning: dict(max_iter=20, transform_algorithm="lasso_lars"),
+    # the default strategy prior would output constant predictions and fail
+    # for check_classifiers_predictions
+    DummyClassifier: dict(strategy="stratified"),
+    ElasticNetCV: dict(max_iter=5, cv=3),
+    ElasticNet: dict(max_iter=5),
+    ExtraTreesClassifier: dict(n_estimators=5),
+    ExtraTreesRegressor: dict(n_estimators=5),
+    FactorAnalysis: dict(max_iter=5),
+    FastICA: dict(max_iter=5),
+    FeatureAgglomeration: dict(n_clusters=2),
+    GammaRegressor: dict(max_iter=5),
+    GaussianMixture: dict(n_init=2, max_iter=5),
+    # Due to the jl lemma and often very few samples, the number
+    # of components of the random matrix projection will be probably
+    # greater than the number of features.
+    # So we impose a smaller number (avoid "auto" mode)
+    GaussianRandomProjection: dict(n_components=2),
+    GradientBoostingClassifier: dict(n_estimators=5),
+    GradientBoostingRegressor: dict(n_estimators=5),
+    GraphicalLassoCV: dict(max_iter=5, cv=3),
+    GraphicalLasso: dict(max_iter=5),
+    GridSearchCV: dict(cv=3),
+    HalvingGridSearchCV: dict(cv=3),
+    HalvingRandomSearchCV: dict(cv=3),
+    HDBSCAN: dict(min_samples=1),
+    # The default min_samples_leaf (20) isn't appropriate for small
+    # datasets (only very shallow trees are built) that the checks use.
+    HistGradientBoostingClassifier: dict(max_iter=5, min_samples_leaf=5),
+    HistGradientBoostingRegressor: dict(max_iter=5, min_samples_leaf=5),
+    HuberRegressor: dict(max_iter=5),
+    IncrementalPCA: dict(batch_size=10),
+    IsolationForest: dict(n_estimators=5),
+    KMeans: dict(n_init=2, n_clusters=2, max_iter=5),
+    LabelPropagation: dict(max_iter=5),
+    LabelSpreading: dict(max_iter=5),
+    LarsCV: dict(max_iter=5, cv=3),
+    LassoCV: dict(max_iter=5, cv=3),
+    Lasso: dict(max_iter=5),
+    LassoLarsCV: dict(max_iter=5, cv=3),
+    LassoLars: dict(max_iter=5),
+    # Noise variance estimation does not work when `n_samples < n_features`.
+    # We need to provide the noise variance explicitly.
+    LassoLarsIC: dict(max_iter=5, noise_variance=1.0),
+    LatentDirichletAllocation: dict(max_iter=5, batch_size=10),
+    LinearSVR: dict(max_iter=20),
+    LinearSVC: dict(max_iter=20),
+    LocallyLinearEmbedding: dict(max_iter=5),
+    LogisticRegressionCV: dict(max_iter=5, cv=3),
+    LogisticRegression: dict(max_iter=5),
+    MDS: dict(n_init=2, max_iter=5),
+    # In the case of check_fit2d_1sample, bandwidth is set to None and
+    # is thus estimated. De facto it is 0.0 as a single sample is provided
+    # and this makes the test fail. Hence we give it a placeholder value.
+    MeanShift: dict(max_iter=5, bandwidth=1.0),
+    MiniBatchDictionaryLearning: dict(batch_size=10, max_iter=5),
+    MiniBatchKMeans: dict(n_init=2, n_clusters=2, max_iter=5, batch_size=10),
+    MiniBatchNMF: dict(batch_size=10, max_iter=20, fresh_restarts=True),
+    MiniBatchSparsePCA: dict(max_iter=5, batch_size=10),
+    MLPClassifier: dict(max_iter=100),
+    MLPRegressor: dict(max_iter=100),
+    MultiTaskElasticNetCV: dict(max_iter=5, cv=3),
+    MultiTaskElasticNet: dict(max_iter=5),
+    MultiTaskLassoCV: dict(max_iter=5, cv=3),
+    MultiTaskLasso: dict(max_iter=5),
+    NeighborhoodComponentsAnalysis: dict(max_iter=5),
+    NMF: dict(max_iter=500),
+    NuSVC: dict(max_iter=-1),
+    NuSVR: dict(max_iter=-1),
+    OneClassSVM: dict(max_iter=-1),
+    OneHotEncoder: dict(handle_unknown="ignore"),
+    OrthogonalMatchingPursuitCV: dict(cv=3),
+    PassiveAggressiveClassifier: dict(max_iter=5),
+    PassiveAggressiveRegressor: dict(max_iter=5),
+    Perceptron: dict(max_iter=5),
+    PLSCanonical: dict(n_components=1, max_iter=5),
+    PLSRegression: dict(n_components=1, max_iter=5),
+    PLSSVD: dict(n_components=1),
+    PoissonRegressor: dict(max_iter=5),
+    RandomForestClassifier: dict(n_estimators=5),
+    RandomForestRegressor: dict(n_estimators=5),
+    RandomizedSearchCV: dict(n_iter=5, cv=3),
+    RandomTreesEmbedding: dict(n_estimators=5),
+    RANSACRegressor: dict(max_trials=10),
+    RegressorChain: dict(cv=3),
+    RFECV: dict(cv=3),
+    # be tolerant of noisy datasets (not actually speed)
+    SelectFdr: dict(alpha=0.5),
+    # SelectKBest has a default of k=10
+    # which is more features than we have in most cases.
+    SelectKBest: dict(k=1),
+    SelfTrainingClassifier: dict(max_iter=5),
+    SequentialFeatureSelector: dict(cv=3),
+    SGDClassifier: dict(max_iter=5),
+    SGDOneClassSVM: dict(max_iter=5),
+    SGDRegressor: dict(max_iter=5),
+    SparsePCA: dict(max_iter=5),
+    # Due to the jl lemma and often very few samples, the number
+    # of components of the random matrix projection will be probably
+    # greater than the number of features.
+    # So we impose a smaller number (avoid "auto" mode)
+    SparseRandomProjection: dict(n_components=2),
+    SpectralBiclustering: dict(n_init=2, n_best=1, n_clusters=2),
+    SpectralClustering: dict(n_init=2, n_clusters=2),
+    SpectralCoclustering: dict(n_init=2, n_clusters=2),
+    # Default "auto" parameter can lead to different ordering of eigenvalues on
+    # windows: #24105
+    SpectralEmbedding: dict(eigen_tol=1e-5),
+    StackingClassifier: dict(cv=3),
+    StackingRegressor: dict(cv=3),
+    SVC: dict(max_iter=-1),
+    SVR: dict(max_iter=-1),
+    TargetEncoder: dict(cv=3),
+    TheilSenRegressor: dict(max_iter=5, max_subpopulation=100),
+    # TruncatedSVD doesn't run with n_components = n_features
+    TruncatedSVD: dict(n_iter=5, n_components=1),
+    TSNE: dict(perplexity=2),
+    TunedThresholdClassifierCV: dict(cv=3),
+    TweedieRegressor: dict(max_iter=5),
+}
+
+
+def _set_checking_parameters(estimator):
+    # set parameters to speed up some estimators and
+    # avoid deprecated behaviour
+    params = estimator.get_params()
+    name = estimator.__class__.__name__
+
+    if type(estimator) in TEST_PARAMS:
+        test_params = TEST_PARAMS[type(estimator)]
+        estimator.set_params(**test_params)
+
+
+def _tested_estimators(type_filter=None):
+    for name, Estimator in all_estimators(type_filter=type_filter):
+        try:
+            estimator = _construct_instance(Estimator)
+        except SkipTest:
+            continue
+
+        yield estimator
+
+
+def _generate_pipeline():
+    for final_estimator in [Ridge(), LogisticRegression()]:
+        yield Pipeline(
+            steps=[
+                ("scaler", StandardScaler()),
+                ("final_estimator", final_estimator),
+            ]
+        )
+
+
+def _construct_instance(Estimator):
+    """Construct Estimator instance if possible."""
+    required_parameters = getattr(Estimator, "_required_parameters", [])
+    if len(required_parameters):
+        if required_parameters in (["estimator"], ["base_estimator"]):
+            # `RANSACRegressor` will raise an error with any model other
+            # than `LinearRegression` if we don't fix `min_samples` parameter.
+            # For common test, we can enforce using `LinearRegression` that
+            # is the default estimator in `RANSACRegressor` instead of `Ridge`.
+            if issubclass(Estimator, RANSACRegressor):
+                estimator = Estimator(LinearRegression())
+            elif issubclass(Estimator, RegressorMixin):
+                estimator = Estimator(Ridge())
+            elif issubclass(Estimator, SelectFromModel):
+                # Increases coverage because SGDRegressor has partial_fit
+                estimator = Estimator(SGDRegressor(random_state=0))
+            else:
+                estimator = Estimator(LogisticRegression(C=1))
+        elif required_parameters in (["estimators"],):
+            # Heterogeneous ensemble classes (i.e. stacking, voting)
+            if issubclass(Estimator, RegressorMixin):
+                estimator = Estimator(
+                    estimators=[
+                        ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
+                        ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
+                    ]
+                )
+            else:
+                estimator = Estimator(
+                    estimators=[
+                        ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
+                        ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
+                    ]
+                )
+        else:
+            msg = (
+                f"Can't instantiate estimator {Estimator.__name__} "
+                f"parameters {required_parameters}"
+            )
+            # raise additional warning to be shown by pytest
+            warnings.warn(msg, SkipTestWarning)
+            raise SkipTest(msg)
+    else:
+        estimator = Estimator()
+    return estimator
+
+
+def _get_check_estimator_ids(obj):
+    """Create pytest ids for checks.
+
+    When `obj` is an estimator, this returns the pprint version of the
+    estimator (with `print_changed_only=True`). When `obj` is a function, the
+    name of the function is returned with its keyword arguments.
+
+    `_get_check_estimator_ids` is designed to be used as the `id` in
+    `pytest.mark.parametrize` where `check_estimator(..., generate_only=True)`
+    is yielding estimators and checks.
+
+    Parameters
+    ----------
+    obj : estimator or function
+        Items generated by `check_estimator`.
+
+    Returns
+    -------
+    id : str or None
+
+    See Also
+    --------
+    check_estimator
+    """
+    if isfunction(obj):
+        return obj.__name__
+    if isinstance(obj, partial):
+        if not obj.keywords:
+            return obj.func.__name__
+        kwstring = ",".join(["{}={}".format(k, v) for k, v in obj.keywords.items()])
+        return "{}({})".format(obj.func.__name__, kwstring)
+    if hasattr(obj, "get_params"):
+        with config_context(print_changed_only=True):
+            return re.sub(r"\s", "", str(obj))
+
+
+def _generate_column_transformer_instances():
+    yield ColumnTransformer(
+        transformers=[
+            ("trans1", StandardScaler(), [0, 1]),
+        ]
+    )
+
+
+def _generate_search_cv_instances():
+    for SearchCV, (Estimator, param_grid) in product(
+        [
+            GridSearchCV,
+            HalvingGridSearchCV,
+            RandomizedSearchCV,
+            HalvingGridSearchCV,
+        ],
+        [
+            (Ridge, {"alpha": [0.1, 1.0]}),
+            (LogisticRegression, {"C": [0.1, 1.0]}),
+        ],
+    ):
+        init_params = signature(SearchCV).parameters
+        extra_params = (
+            {"min_resources": "smallest"} if "min_resources" in init_params else {}
+        )
+        search_cv = SearchCV(
+            Estimator(), param_grid, cv=2, error_score="raise", **extra_params
+        )
+        set_random_state(search_cv)
+        yield search_cv
+
+    for SearchCV, (Estimator, param_grid) in product(
+        [
+            GridSearchCV,
+            HalvingGridSearchCV,
+            RandomizedSearchCV,
+            HalvingRandomSearchCV,
+        ],
+        [
+            (Ridge, {"ridge__alpha": [0.1, 1.0]}),
+            (LogisticRegression, {"logisticregression__C": [0.1, 1.0]}),
+        ],
+    ):
+        init_params = signature(SearchCV).parameters
+        extra_params = (
+            {"min_resources": "smallest"} if "min_resources" in init_params else {}
+        )
+        search_cv = SearchCV(
+            make_pipeline(PCA(), Estimator()), param_grid, cv=2, **extra_params
+        ).set_params(error_score="raise")
+        set_random_state(search_cv)
+        yield search_cv
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 42edfe0d4d3c4..745503c54a7aa 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -9,7 +9,7 @@
 from contextlib import nullcontext
 from copy import deepcopy
 from functools import partial, wraps
-from inspect import isfunction, signature
+from inspect import signature
 from numbers import Integral, Real
 
 import joblib
@@ -20,7 +20,6 @@
 from .. import config_context
 from ..base import (
     ClusterMixin,
-    RegressorMixin,
     clone,
     is_classifier,
     is_outlier_detector,
@@ -34,22 +33,12 @@
     make_regression,
 )
 from ..exceptions import DataConversionWarning, NotFittedError, SkipTestWarning
-from ..feature_selection import SelectFromModel, SelectKBest
-from ..linear_model import (
-    LinearRegression,
-    LogisticRegression,
-    RANSACRegressor,
-    Ridge,
-    SGDRegressor,
-)
 from ..metrics import accuracy_score, adjusted_rand_score, f1_score
 from ..metrics.pairwise import linear_kernel, pairwise_distances, rbf_kernel
 from ..model_selection import ShuffleSplit, train_test_split
 from ..model_selection._validation import _safe_split
 from ..pipeline import make_pipeline
 from ..preprocessing import StandardScaler, scale
-from ..random_projection import BaseRandomProjection
-from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
 from ..utils._array_api import (
     _atol_for_type,
     _convert_to_numpy,
@@ -69,6 +58,11 @@
     _DEFAULT_TAGS,
     _safe_tags,
 )
+from ._test_common.instance_generator import (
+    CROSS_DECOMPOSITION,
+    _construct_instance,
+    _get_check_estimator_ids,
+)
 from ._testing import (
     SkipTest,
     _array_api_for_tests,
@@ -87,7 +81,6 @@
 from .validation import _num_samples, check_is_fitted, has_fit_parameter
 
 REGRESSION_DATASET = None
-CROSS_DECOMPOSITION = ["PLSCanonical", "PLSRegression", "CCA", "PLSSVD"]
 
 
 def _yield_checks(estimator):
@@ -380,89 +373,6 @@ def _yield_all_checks(estimator):
         yield check_fit_non_negative
 
 
-def _get_check_estimator_ids(obj):
-    """Create pytest ids for checks.
-
-    When `obj` is an estimator, this returns the pprint version of the
-    estimator (with `print_changed_only=True`). When `obj` is a function, the
-    name of the function is returned with its keyword arguments.
-
-    `_get_check_estimator_ids` is designed to be used as the `id` in
-    `pytest.mark.parametrize` where `check_estimator(..., generate_only=True)`
-    is yielding estimators and checks.
-
-    Parameters
-    ----------
-    obj : estimator or function
-        Items generated by `check_estimator`.
-
-    Returns
-    -------
-    id : str or None
-
-    See Also
-    --------
-    check_estimator
-    """
-    if isfunction(obj):
-        return obj.__name__
-    if isinstance(obj, partial):
-        if not obj.keywords:
-            return obj.func.__name__
-        kwstring = ",".join(["{}={}".format(k, v) for k, v in obj.keywords.items()])
-        return "{}({})".format(obj.func.__name__, kwstring)
-    if hasattr(obj, "get_params"):
-        with config_context(print_changed_only=True):
-            return re.sub(r"\s", "", str(obj))
-
-
-def _construct_instance(Estimator):
-    """Construct Estimator instance if possible."""
-    required_parameters = getattr(Estimator, "_required_parameters", [])
-    if len(required_parameters):
-        if required_parameters in (["estimator"], ["base_estimator"]):
-            # `RANSACRegressor` will raise an error with any model other
-            # than `LinearRegression` if we don't fix `min_samples` parameter.
-            # For common test, we can enforce using `LinearRegression` that
-            # is the default estimator in `RANSACRegressor` instead of `Ridge`.
-            if issubclass(Estimator, RANSACRegressor):
-                estimator = Estimator(LinearRegression())
-            elif issubclass(Estimator, RegressorMixin):
-                estimator = Estimator(Ridge())
-            elif issubclass(Estimator, SelectFromModel):
-                # Increases coverage because SGDRegressor has partial_fit
-                estimator = Estimator(SGDRegressor(random_state=0))
-            else:
-                estimator = Estimator(LogisticRegression(C=1))
-        elif required_parameters in (["estimators"],):
-            # Heterogeneous ensemble classes (i.e. stacking, voting)
-            if issubclass(Estimator, RegressorMixin):
-                estimator = Estimator(
-                    estimators=[
-                        ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
-                        ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
-                    ]
-                )
-            else:
-                estimator = Estimator(
-                    estimators=[
-                        ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
-                        ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
-                    ]
-                )
-        else:
-            msg = (
-                f"Can't instantiate estimator {Estimator.__name__} "
-                f"parameters {required_parameters}"
-            )
-            # raise additional warning to be shown by pytest
-            warnings.warn(msg, SkipTestWarning)
-            raise SkipTest(msg)
-    else:
-        estimator = Estimator()
-    return estimator
-
-
 def _maybe_mark_xfail(estimator, check, pytest):
     # Mark (estimator, check) pairs as XFAIL if needed (see conditions in
     # _should_be_skipped_or_marked())
@@ -672,124 +582,6 @@ def _regression_dataset():
     return REGRESSION_DATASET
 
 
-def _set_checking_parameters(estimator):
-    # set parameters to speed up some estimators and
-    # avoid deprecated behaviour
-    params = estimator.get_params()
-    name = estimator.__class__.__name__
-    if name == "TSNE":
-        estimator.set_params(perplexity=2)
-    if "n_iter" in params and name != "TSNE":
-        estimator.set_params(n_iter=5)
-    if "max_iter" in params:
-        if estimator.max_iter is not None:
-            estimator.set_params(max_iter=min(5, estimator.max_iter))
-        # LinearSVR, LinearSVC
-        if name in ["LinearSVR", "LinearSVC"]:
-            estimator.set_params(max_iter=20)
-        # NMF
-        if name == "NMF":
-            estimator.set_params(max_iter=500)
-        # DictionaryLearning
-        if name == "DictionaryLearning":
-            estimator.set_params(max_iter=20, transform_algorithm="lasso_lars")
-        # MiniBatchNMF
-        if estimator.__class__.__name__ == "MiniBatchNMF":
-            estimator.set_params(max_iter=20, fresh_restarts=True)
-        # MLP
-        if name in ["MLPClassifier", "MLPRegressor"]:
-            estimator.set_params(max_iter=100)
-        # MiniBatchDictionaryLearning
-        if name == "MiniBatchDictionaryLearning":
-            estimator.set_params(max_iter=5)
-
-    if "n_resampling" in params:
-        # randomized lasso
-        estimator.set_params(n_resampling=5)
-    if "n_estimators" in params:
-        estimator.set_params(n_estimators=min(5, estimator.n_estimators))
-    if "max_trials" in params:
-        # RANSAC
-        estimator.set_params(max_trials=10)
-    if "n_init" in params:
-        # K-Means
-        estimator.set_params(n_init=2)
-    if "batch_size" in params and not name.startswith("MLP"):
-        estimator.set_params(batch_size=10)
-
-    if name == "MeanShift":
-        # In the case of check_fit2d_1sample, bandwidth is set to None and
-        # is thus estimated. De facto it is 0.0 as a single sample is provided
-        # and this makes the test fails. Hence we give it a placeholder value.
-        estimator.set_params(bandwidth=1.0)
-
-    if name == "TruncatedSVD":
-        # TruncatedSVD doesn't run with n_components = n_features
-        # This is ugly :-/
-        estimator.n_components = 1
-
-    if name == "LassoLarsIC":
-        # Noise variance estimation does not work when `n_samples < n_features`.
-        # We need to provide the noise variance explicitly.
-        estimator.set_params(noise_variance=1.0)
-
-    if hasattr(estimator, "n_clusters"):
-        estimator.n_clusters = min(estimator.n_clusters, 2)
-
-    if hasattr(estimator, "n_best"):
-        estimator.n_best = 1
-
-    if name == "SelectFdr":
-        # be tolerant of noisy datasets (not actually speed)
-        estimator.set_params(alpha=0.5)
-
-    if name == "TheilSenRegressor":
-        estimator.max_subpopulation = 100
-
-    if isinstance(estimator, BaseRandomProjection):
-        # Due to the jl lemma and often very few samples, the number
-        # of components of the random matrix projection will be probably
-        # greater than the number of features.
-        # So we impose a smaller number (avoid "auto" mode)
-        estimator.set_params(n_components=2)
-
-    if isinstance(estimator, SelectKBest):
-        # SelectKBest has a default of k=10
-        # which is more feature than we have in most case.
-        estimator.set_params(k=1)
-
-    if name in ("HistGradientBoostingClassifier", "HistGradientBoostingRegressor"):
-        # The default min_samples_leaf (20) isn't appropriate for small
-        # datasets (only very shallow trees are built) that the checks use.
-        estimator.set_params(min_samples_leaf=5)
-
-    if name == "DummyClassifier":
-        # the default strategy prior would output constant predictions and fail
-        # for check_classifiers_predictions
-        estimator.set_params(strategy="stratified")
-
-    # Speed-up by reducing the number of CV or splits for CV estimators
-    loo_cv = ["RidgeCV", "RidgeClassifierCV"]
-    if name not in loo_cv and hasattr(estimator, "cv"):
-        estimator.set_params(cv=3)
-    if hasattr(estimator, "n_splits"):
-        estimator.set_params(n_splits=3)
-
-    if name == "OneHotEncoder":
-        estimator.set_params(handle_unknown="ignore")
-
-    if name in CROSS_DECOMPOSITION:
-        estimator.set_params(n_components=1)
-
-    # Default "auto" parameter can lead to different ordering of eigenvalues on
-    # windows: #24105
-    if name == "SpectralEmbedding":
-        estimator.set_params(eigen_tol=1e-5)
-
-    if name == "HDBSCAN":
-        estimator.set_params(min_samples=1)
-
-
 class _NotAnArray:
     """An object that is convertible to an array.
 
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index 8ac7ac9db2e9a..066277ff24af6 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -30,6 +30,7 @@
 from sklearn.svm import SVC, NuSVC
 from sklearn.utils import _array_api, all_estimators, deprecated
 from sklearn.utils._param_validation import Interval, StrOptions
+from sklearn.utils._test_common.instance_generator import _set_checking_parameters
 from sklearn.utils._testing import (
     MinimalClassifier,
     MinimalRegressor,
@@ -40,7 +41,6 @@
 )
 from sklearn.utils.estimator_checks import (
     _NotAnArray,
-    _set_checking_parameters,
     _yield_all_checks,
     check_array_api_input,
     check_class_weight_balanced_linear_classifier,

From 3474eea41c12ded3e7b9db287b309d11f3bd3abd Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 22 Aug 2024 08:19:22 +0200
Subject: [PATCH 03/17] add legacy to check_estimator

---
 sklearn/utils/estimator_checks.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 01b1276edcdaa..dbd15bd1c2089 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -594,7 +594,7 @@ def checks_generator():
     )
 
 
-def check_estimator(estimator=None, generate_only=False):
+def check_estimator(estimator=None, generate_only=False, legacy=True):
     """Check if estimator adheres to scikit-learn conventions.
 
     This function will run an extensive test-suite for input validation,
@@ -613,6 +613,11 @@ def check_estimator(estimator=None, generate_only=False):
     :func:`~sklearn.utils.estimator_checks.parametrize_with_checks`, making it
     easier to test multiple estimators.
 
+    Checks are categorised into the following groups:
+
+        - API checks: a set of checks to ensure API compatibility with scikit-learn
+        - legacy: a set of checks which will gradually be grouped into other categories
+
     Parameters
     ----------
     estimator : estimator object
@@ -630,6 +635,11 @@ def check_estimator(estimator=None, generate_only=False):
 
         .. versionadded:: 0.22
 
+    legacy : bool, default=True
+        Whether to include legacy checks.
+
+        .. versionadded:: 1.6
+
     Returns
     -------
     checks_generator : generator
@@ -659,7 +669,7 @@ def check_estimator(estimator=None, generate_only=False):
     name = type(estimator).__name__
 
     def checks_generator():
-        for check in _yield_all_checks(estimator):
+        for check in _yield_all_checks(estimator, legacy=legacy):
             check = _maybe_skip(estimator, check)
             yield estimator, partial(check, name)
 

From 3975f17f28d8ea98fb2f4fcc1911694348ae7ee8 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 22 Aug 2024 17:44:08 +0200
Subject: [PATCH 04/17] fix tests

---
 sklearn/utils/tests/test_estimator_checks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index 8ac7ac9db2e9a..7cf7e19f70cfe 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -1210,7 +1210,7 @@ def test_non_deterministic_estimator_skip_tests():
     # check estimators with non_deterministic tag set to True
     # will skip certain tests, refer to issue #22313 for details
     for est in [MinimalTransformer, MinimalRegressor, MinimalClassifier]:
-        all_tests = list(_yield_all_checks(est()))
+        all_tests = list(_yield_all_checks(est(), legacy=True))
         assert check_methods_sample_order_invariance in all_tests
         assert check_methods_subset_invariance in all_tests
 
@@ -1218,7 +1218,7 @@ class Estimator(est):
             def _more_tags(self):
                 return {"non_deterministic": True}
 
-        all_tests = list(_yield_all_checks(Estimator()))
+        all_tests = list(_yield_all_checks(Estimator(), legacy=True))
         assert check_methods_sample_order_invariance not in all_tests
         assert check_methods_subset_invariance not in all_tests
 

From e27edd3dc930a6fb213ce44b442045a2b4c6932e Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Fri, 23 Aug 2024 09:46:41 +0200
Subject: [PATCH 05/17] remove unnecessary vars

---
 sklearn/utils/_test_common/instance_generator.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index 3e215111adcda..c8887ad524dd4 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -276,9 +276,6 @@
 def _set_checking_parameters(estimator):
     # set parameters to speed up some estimators and
     # avoid deprecated behaviour
-    params = estimator.get_params()
-    name = estimator.__class__.__name__
-
     if type(estimator) in TEST_PARAMS:
         test_params = TEST_PARAMS[type(estimator)]
         estimator.set_params(**test_params)

From 7eec0682bd5e5a6b33d77bc73cb0328a6e935164 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Fri, 23 Aug 2024 14:25:29 +0200
Subject: [PATCH 06/17] TST remove _required_parameters

---
 sklearn/tests/test_common.py      |  9 +++------
 sklearn/utils/estimator_checks.py | 27 +++++++++++++++++++++------
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 467c7db9a3d21..d00b3779b77a7 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -51,6 +51,7 @@
 from sklearn.utils import all_estimators
 from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags
 from sklearn.utils._test_common.instance_generator import (
+    _construct_instance,
     _generate_column_transformer_instances,
     _generate_pipeline,
     _generate_search_cv_instances,
@@ -151,12 +152,8 @@ def _tested_linear_classifiers():
 
     with warnings.catch_warnings(record=True):
         for name, clazz in classifiers:
-            required_parameters = getattr(clazz, "_required_parameters", [])
-            if len(required_parameters):
-                # FIXME
-                continue
-
-            if "class_weight" in clazz().get_params().keys() and issubclass(
+            instance = _construct_instance(clazz)
+            if "class_weight" in instance.get_params().keys() and issubclass(
                 clazz, LinearClassifierMixin
             ):
                 yield name, clazz
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 745503c54a7aa..e0a38492b0091 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -87,6 +87,8 @@ def _yield_checks(estimator):
     name = estimator.__class__.__name__
     tags = _safe_tags(estimator)
 
+    yield check_estimator_cloneable
+    yield check_estimator_repr
     yield check_no_attributes_set_in_init
     yield check_estimators_dtypes
     yield check_fit_score_takes_y
@@ -3208,6 +3210,23 @@ def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type):
     assert_allclose(pred1, pred2, atol=1e-2, err_msg=name)
 
 
+def check_estimator_cloneable(name, estimator_orig):
+    """Checks whether the estimator can be cloned."""
+    try:
+        clone(estimator_orig)
+    except Exception as e:
+        raise AssertionError(f"Cloning of {name} failed with error: {e}.") from e
+
+
+def check_estimator_repr(name, estimator_orig):
+    """Check that the estimator has a functioning repr."""
+    estimator = clone(estimator_orig)
+    try:
+        repr(estimator)
+    except Exception as e:
+        raise AssertionError(f"Repr of {name} failed with error: {e}.") from e
+
+
 def check_parameters_default_constructible(name, Estimator):
     # test default-constructibility
     # get rid of deprecation warnings
@@ -3216,10 +3235,6 @@ def check_parameters_default_constructible(name, Estimator):
 
     with ignore_warnings(category=FutureWarning):
         estimator = _construct_instance(Estimator)
-        # test cloning
-        clone(estimator)
-        # test __repr__
-        repr(estimator)
         # test that set_params returns self
         assert estimator.set_params() is estimator
 
@@ -3239,6 +3254,8 @@ def param_filter(p):
                     p.name != "self"
                     and p.kind != p.VAR_KEYWORD
                     and p.kind != p.VAR_POSITIONAL
+                    # and it should have a default value for this test
+                    and p.default != p.empty
                 )
 
             init_params = [
@@ -3250,8 +3267,6 @@ def param_filter(p):
             # true for mixins
             return
         params = estimator.get_params()
-        # they can need a non-default argument
-        init_params = init_params[len(getattr(estimator, "_required_parameters", [])) :]
 
         for init_param in init_params:
             assert (

From c3f1249be0418a67972a69ba585cd08bd5879a27 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Fri, 23 Aug 2024 15:24:38 +0200
Subject: [PATCH 07/17] TST remove _required_parameters

---
 .../utils/_test_common/instance_generator.py  | 136 ++++++++++++------
 sklearn/utils/estimator_checks.py             |  12 +-
 2 files changed, 101 insertions(+), 47 deletions(-)

diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index c8887ad524dd4..3e56eec8abca8 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -3,13 +3,11 @@
 
 
 import re
-import warnings
 from functools import partial
 from inspect import isfunction, signature
 from itertools import product
 
 from sklearn import config_context
-from sklearn.base import RegressorMixin
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.cluster import (
     HDBSCAN,
@@ -60,10 +58,12 @@
     RandomTreesEmbedding,
     StackingClassifier,
     StackingRegressor,
+    VotingClassifier,
+    VotingRegressor,
 )
-from sklearn.exceptions import SkipTestWarning
 from sklearn.experimental import enable_halving_search_cv  # noqa
 from sklearn.feature_selection import (
+    RFE,
     RFECV,
     SelectFdr,
     SelectFromModel,
@@ -106,16 +106,27 @@
 from sklearn.manifold import MDS, TSNE, LocallyLinearEmbedding, SpectralEmbedding
 from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
 from sklearn.model_selection import (
+    FixedThresholdClassifier,
     GridSearchCV,
     HalvingGridSearchCV,
     HalvingRandomSearchCV,
     RandomizedSearchCV,
     TunedThresholdClassifierCV,
 )
-from sklearn.multioutput import ClassifierChain, RegressorChain
+from sklearn.multiclass import (
+    OneVsOneClassifier,
+    OneVsRestClassifier,
+    OutputCodeClassifier,
+)
+from sklearn.multioutput import (
+    ClassifierChain,
+    MultiOutputClassifier,
+    MultiOutputRegressor,
+    RegressorChain,
+)
 from sklearn.neighbors import NeighborhoodComponentsAnalysis
 from sklearn.neural_network import BernoulliRBM, MLPClassifier, MLPRegressor
-from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
 from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder
 from sklearn.random_projection import (
     GaussianRandomProjection,
@@ -301,51 +312,86 @@ def _generate_pipeline():
         )
 
 
+INIT_PARAMS = {
+    SelfTrainingClassifier: dict(estimator=LogisticRegression(C=1)),
+    CalibratedClassifierCV: dict(estimator=LogisticRegression(C=1)),
+    ClassifierChain: dict(base_estimator=LogisticRegression(C=1)),
+    ColumnTransformer: dict(transformers=[("trans1", StandardScaler(), [0])]),
+    FeatureUnion: dict(transformer_list=[("trans1", StandardScaler())]),
+    FixedThresholdClassifier: dict(estimator=LogisticRegression(C=1)),
+    GridSearchCV: dict(estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}),
+    HalvingGridSearchCV: dict(
+        estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}
+    ),
+    HalvingRandomSearchCV: dict(
+        estimator=LogisticRegression(C=1), param_distributions={"C": [1.0]}
+    ),
+    MultiOutputClassifier: dict(estimator=LogisticRegression(C=1)),
+    MultiOutputRegressor: dict(estimator=Ridge()),
+    OneVsOneClassifier: dict(estimator=LogisticRegression(C=1)),
+    OneVsRestClassifier: dict(estimator=LogisticRegression(C=1)),
+    OutputCodeClassifier: dict(estimator=LogisticRegression(C=1)),
+    Pipeline: dict(steps=[("scaler", StandardScaler()), ("est", Ridge())]),
+    RandomizedSearchCV: dict(
+        estimator=LogisticRegression(C=1), param_distributions={"C": [1.0]}
+    ),
+    # `RANSACRegressor` will raise an error with any model other
+    # than `LinearRegression` if we don't fix the `min_samples` parameter.
+    # For the common tests, we enforce using `LinearRegression`, which
+    # is the default estimator in `RANSACRegressor`, instead of `Ridge`.
+    RANSACRegressor: dict(estimator=LinearRegression()),
+    RegressorChain: dict(base_estimator=Ridge()),
+    RFECV: dict(estimator=LogisticRegression(C=1)),
+    RFE: dict(estimator=LogisticRegression(C=1)),
+    # Increases coverage because SGDRegressor has partial_fit
+    SelectFromModel: dict(estimator=SGDRegressor(random_state=0)),
+    SequentialFeatureSelector: dict(estimator=LogisticRegression(C=1)),
+    StackingClassifier: dict(
+        estimators=[
+            ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
+            ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
+        ]
+    ),
+    StackingRegressor: dict(
+        estimators=[
+            ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
+            ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
+        ]
+    ),
+    TunedThresholdClassifierCV: dict(estimator=LogisticRegression(C=1)),
+    VotingClassifier: dict(
+        estimators=[
+            ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
+            ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
+        ]
+    ),
+    VotingRegressor: dict(
+        estimators=[
+            ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
+            ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
+        ]
+    ),
+}
+
+
 def _construct_instance(Estimator):
     """Construct Estimator instance if possible."""
-    required_parameters = getattr(Estimator, "_required_parameters", [])
-    if len(required_parameters):
-        if required_parameters in (["estimator"], ["base_estimator"]):
-            # `RANSACRegressor` will raise an error with any model other
-            # than `LinearRegression` if we don't fix `min_samples` parameter.
-            # For common test, we can enforce using `LinearRegression` that
-            # is the default estimator in `RANSACRegressor` instead of `Ridge`.
-            if issubclass(Estimator, RANSACRegressor):
-                estimator = Estimator(LinearRegression())
-            elif issubclass(Estimator, RegressorMixin):
-                estimator = Estimator(Ridge())
-            elif issubclass(Estimator, SelectFromModel):
-                # Increases coverage because SGDRegressor has partial_fit
-                estimator = Estimator(SGDRegressor(random_state=0))
-            else:
-                estimator = Estimator(LogisticRegression(C=1))
-        elif required_parameters in (["estimators"],):
-            # Heterogeneous ensemble classes (i.e. stacking, voting)
-            if issubclass(Estimator, RegressorMixin):
-                estimator = Estimator(
-                    estimators=[
-                        ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
-                        ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
-                    ]
-                )
-            else:
-                estimator = Estimator(
-                    estimators=[
-                        ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
-                        ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
-                    ]
-                )
-        else:
-            msg = (
-                f"Can't instantiate estimator {Estimator.__name__} "
-                f"parameters {required_parameters}"
-            )
-            # raise additional warning to be shown by pytest
-            warnings.warn(msg, SkipTestWarning)
-            raise SkipTest(msg)
+    if Estimator in INIT_PARAMS:
+        estimator = Estimator(**INIT_PARAMS[Estimator])
     else:
         estimator = Estimator()
     return estimator
+    #     else:
+    #         msg = (
+    #             f"Can't instantiate estimator {Estimator.__name__} "
+    #             f"parameters {required_parameters}"
+    #         )
+    #         # raise additional warning to be shown by pytest
+    #         warnings.warn(msg, SkipTestWarning)
+    #         raise SkipTest(msg)
+    # else:
+    #     estimator = Estimator()
+    # return estimator
 
 
 def _get_check_estimator_ids(obj):
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index e84cd57ff01ea..6785a8bd99937 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -60,6 +60,7 @@
 )
 from ._test_common.instance_generator import (
     CROSS_DECOMPOSITION,
+    INIT_PARAMS,
     _construct_instance,
     _get_check_estimator_ids,
 )
@@ -3256,11 +3257,11 @@ def check_estimator_repr(name, estimator_orig):
         raise AssertionError(f"Repr of {name} failed with error: {e}.") from e
 
 
-def check_parameters_default_constructible(name, Estimator):
+def check_parameters_default_constructible(name, estimator_orig):
     # test default-constructibility
     # get rid of deprecation warnings
 
-    Estimator = Estimator.__class__
+    Estimator = estimator_orig.__class__
 
     with ignore_warnings(category=FutureWarning):
         estimator = _construct_instance(Estimator)
@@ -3298,6 +3299,13 @@ def param_filter(p):
         params = estimator.get_params()
 
         for init_param in init_params:
+            if (
+                type(estimator) in INIT_PARAMS
+                and init_param.name in INIT_PARAMS[type(estimator)]
+            ):
+                # these parameters are coming from INIT_PARAMS and not the default
+                # values, therefore ignored.
+                continue
             assert (
                 init_param.default != init_param.empty
             ), "parameter %s for %s has no default value" % (

From 7c4a3b28049e87584cd82564acf277257e675335 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Sat, 24 Aug 2024 11:35:35 +0200
Subject: [PATCH 08/17] ignore failing tests

---
 sklearn/compose/_column_transformer.py        | 13 +++++++++++
 .../_search_successive_halving.py             |  3 +++
 sklearn/pipeline.py                           |  9 ++++++++
 .../utils/_test_common/instance_generator.py  | 22 +++++++++----------
 4 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index 2f8c28d4b02b9..325c90d697acc 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -1320,6 +1320,19 @@ def get_metadata_routing(self):
 
         return router
 
+    def _more_tags(self):
+        return {
+            "_xfail_checks": {
+                "check_estimators_empty_data_messages": "FIXME",
+                "check_estimators_nan_inf": "FIXME",
+                "check_estimator_sparse_array": "FIXME",
+                "check_estimator_sparse_matrix": "FIXME",
+                "check_transformer_data_not_an_array": "FIXME",
+                "check_fit1d": "FIXME",
+                "check_fit2d_predict1d": "FIXME",
+            }
+        }
+
 
 def _check_X(X):
     """Use check_array only when necessary, e.g. on lists and other non-array-likes."""
diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py
index 5980d40cb8e40..e5e3096a85379 100644
--- a/sklearn/model_selection/_search_successive_halving.py
+++ b/sklearn/model_selection/_search_successive_halving.py
@@ -379,6 +379,9 @@ def _more_tags(self):
                     "Fail during parameter check since min/max resources requires"
                     " more samples"
                 ),
+                "check_estimators_nan_inf": "FIXME",
+                "check_classifiers_one_label_sample_weights": "FIXME",
+                "check_fit2d_1feature": "FIXME",
             }
         )
         return tags
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 47faefcfd56ff..db0bdf0b6440c 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -1881,6 +1881,15 @@ def get_metadata_routing(self):
 
         return router
 
+    def _more_tags(self):
+        return {
+            "_xfail_checks": {
+                "check_estimators_overwrite_params": "FIXME",
+                "check_estimators_nan_inf": "FIXME",
+                "check_dont_overwrite_parameters": "FIXME",
+            }
+        }
+
 
 def make_union(*transformers, n_jobs=None, verbose=False):
     """Construct a :class:`FeatureUnion` from the given transformers.
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index 3e56eec8abca8..541819259a9f9 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -3,6 +3,7 @@
 
 
 import re
+import warnings
 from functools import partial
 from inspect import isfunction, signature
 from itertools import product
@@ -37,6 +38,7 @@
     MiniBatchDictionaryLearning,
     MiniBatchNMF,
     MiniBatchSparsePCA,
+    SparseCoder,
     SparsePCA,
     TruncatedSVD,
 )
@@ -61,6 +63,7 @@
     VotingClassifier,
     VotingRegressor,
 )
+from sklearn.exceptions import SkipTestWarning
 from sklearn.experimental import enable_halving_search_cv  # noqa
 from sklearn.feature_selection import (
     RFE,
@@ -373,25 +376,22 @@ def _generate_pipeline():
     ),
 }
 
+SKIPPED_ESTIMATORS = [SparseCoder]
+
 
 def _construct_instance(Estimator):
     """Construct Estimator instance if possible."""
+    if Estimator in SKIPPED_ESTIMATORS:
+        msg = f"Can't instantiate estimator {Estimator.__name__}"
+        # raise additional warning to be shown by pytest
+        warnings.warn(msg, SkipTestWarning)
+        raise SkipTest(msg)
+
     if Estimator in INIT_PARAMS:
         estimator = Estimator(**INIT_PARAMS[Estimator])
     else:
         estimator = Estimator()
     return estimator
-    #     else:
-    #         msg = (
-    #             f"Can't instantiate estimator {Estimator.__name__} "
-    #             f"parameters {required_parameters}"
-    #         )
-    #         # raise additional warning to be shown by pytest
-    #         warnings.warn(msg, SkipTestWarning)
-    #         raise SkipTest(msg)
-    # else:
-    #     estimator = Estimator()
-    # return estimator
 
 
 def _get_check_estimator_ids(obj):

From 1f220c48620c4395f854dca861f05d0271d96971 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Wed, 4 Sep 2024 15:07:11 +0200
Subject: [PATCH 09/17] skipping tests that should be skipped

---
 sklearn/compose/_column_transformer.py | 2 ++
 sklearn/tests/test_common.py           | 7 ++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index 325c90d697acc..60c0feb9cb279 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -1330,6 +1330,8 @@ def _more_tags(self):
                 "check_transformer_data_not_an_array": "FIXME",
                 "check_fit1d": "FIXME",
                 "check_fit2d_predict1d": "FIXME",
+                "check_complex_data": "FIXME",
+                "check_fit2d_1feature": "FIXME",
             }
         }
 
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index d2df478e7899e..ebaa7a6037979 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -26,6 +26,7 @@
     MeanShift,
     SpectralClustering,
 )
+from sklearn.compose import ColumnTransformer
 from sklearn.datasets import make_blobs
 from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
 from sklearn.experimental import (
@@ -45,7 +46,7 @@
     RadiusNeighborsClassifier,
     RadiusNeighborsRegressor,
 )
-from sklearn.pipeline import make_pipeline
+from sklearn.pipeline import FeatureUnion, make_pipeline
 from sklearn.preprocessing import (
     FunctionTransformer,
     MinMaxScaler,
@@ -310,6 +311,8 @@ def _estimators_that_predict_in_fit():
     "estimator", column_name_estimators, ids=_get_check_estimator_ids
 )
 def test_pandas_column_name_consistency(estimator):
+    if isinstance(estimator, ColumnTransformer):
+        pytest.skip("ColumnTransformer is not tested here")
     _set_checking_parameters(estimator)
     with ignore_warnings(category=(FutureWarning)):
         with warnings.catch_warnings(record=True) as record:
@@ -400,6 +403,8 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator):
     ids=_get_check_estimator_ids,
 )
 def test_check_param_validation(estimator):
+    if isinstance(estimator, FeatureUnion):
+        pytest.skip("FeatureUnion is not tested here")
     name = estimator.__class__.__name__
     _set_checking_parameters(estimator)
     check_param_validation(name, estimator)

From f434406db8afa6e20dada0e7cae66c9d0329ffc8 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Wed, 4 Sep 2024 15:49:22 +0200
Subject: [PATCH 10/17] remove _required_parameters

---
 doc/developers/develop.rst                    |  9 ------
 sklearn/base.py                               | 28 ++++---------------
 sklearn/compose/_column_transformer.py        |  2 --
 sklearn/decomposition/_dict_learning.py       |  2 --
 sklearn/ensemble/_base.py                     |  6 ----
 .../_classification_threshold.py              |  1 -
 sklearn/model_selection/_search.py            |  4 ---
 .../_search_successive_halving.py             |  4 ---
 sklearn/pipeline.py                           |  4 ---
 9 files changed, 5 insertions(+), 55 deletions(-)

diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst
index 97cb156da5812..c7e66d8df8ce5 100644
--- a/doc/developers/develop.rst
+++ b/doc/developers/develop.rst
@@ -659,15 +659,6 @@ Even if it is not recommended, it is possible to override the method
 any of the keys documented above is not present in the output of `_get_tags()`,
 an error will occur.
 
-In addition to the tags, estimators also need to declare any non-optional
-parameters to ``__init__`` in the ``_required_parameters`` class attribute,
-which is a list or tuple.  If ``_required_parameters`` is only
-``["estimator"]`` or ``["base_estimator"]``, then the estimator will be
-instantiated with an instance of ``LogisticRegression`` (or
-``RidgeRegression`` if the estimator is a regressor) in the tests. The choice
-of these two models is somewhat idiosyncratic but both should provide robust
-closed-form solutions.
-
 .. _developer_api_set_output:
 
 Developer API for `set_output`
diff --git a/sklearn/base.py b/sklearn/base.py
index 48b9081b8ade3..bc20b2282698f 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -1314,32 +1314,14 @@ def fit_predict(self, X, y=None, **kwargs):
 class MetaEstimatorMixin:
     """Mixin class for all meta estimators in scikit-learn.
 
-    This mixin defines the following functionality:
-
-    - define `_required_parameters` that specify the mandatory `estimator` parameter.
+    This mixin is empty, and only exists to indicate that the estimator is a
+    meta-estimator.
 
-    Examples
-    --------
-    >>> from sklearn.base import MetaEstimatorMixin
-    >>> from sklearn.datasets import load_iris
-    >>> from sklearn.linear_model import LogisticRegression
-    >>> class MyEstimator(MetaEstimatorMixin):
-    ...     def __init__(self, *, estimator=None):
-    ...         self.estimator = estimator
-    ...     def fit(self, X, y=None):
-    ...         if self.estimator is None:
-    ...             self.estimator_ = LogisticRegression()
-    ...         else:
-    ...             self.estimator_ = self.estimator
-    ...         return self
-    >>> X, y = load_iris(return_X_y=True)
-    >>> estimator = MyEstimator().fit(X, y)
-    >>> estimator.estimator_
-    LogisticRegression()
+    .. versionchanged:: 1.6
+        The `_required_parameters` is now removed and is unnecessary since tests are
+        refactored and don't use this anymore.
     """
 
-    _required_parameters = ["estimator"]
-
 
 class MultiOutputMixin:
     """Mixin to mark estimators that support multioutput."""
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index 60c0feb9cb279..fa97d6ff4edbd 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -285,8 +285,6 @@ class ColumnTransformer(TransformerMixin, _BaseComposition):
     :ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`.
     """
 
-    _required_parameters = ["transformers"]
-
     _parameter_constraints: dict = {
         "transformers": [list, Hidden(tuple)],
         "remainder": [
diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py
index 451d120756e9c..bb0131753929b 100644
--- a/sklearn/decomposition/_dict_learning.py
+++ b/sklearn/decomposition/_dict_learning.py
@@ -1279,8 +1279,6 @@ class SparseCoder(_BaseSparseCoding, BaseEstimator):
            [ 0.,  1.,  1.,  0.,  0.]])
     """
 
-    _required_parameters = ["dictionary"]
-
     def __init__(
         self,
         dictionary,
diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py
index 289c7c9b46f4a..a1527ecc088d8 100644
--- a/sklearn/ensemble/_base.py
+++ b/sklearn/ensemble/_base.py
@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 from abc import ABCMeta, abstractmethod
-from typing import List
 
 import numpy as np
 from joblib import effective_n_jobs
@@ -106,9 +105,6 @@ class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
         The collection of fitted base estimators.
     """
 
-    # overwrite _required_parameters from MetaEstimatorMixin
-    _required_parameters: List[str] = []
-
     @abstractmethod
     def __init__(
         self,
@@ -200,8 +196,6 @@ class _BaseHeterogeneousEnsemble(
         appear in `estimators_`.
     """
 
-    _required_parameters = ["estimators"]
-
     @property
     def named_estimators(self):
         """Dictionary to access any fitted sub-estimators by name.
diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py
index bd30a98ac7cc9..1c63d64a42252 100644
--- a/sklearn/model_selection/_classification_threshold.py
+++ b/sklearn/model_selection/_classification_threshold.py
@@ -87,7 +87,6 @@ class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator
           error.
     """
 
-    _required_parameters = ["estimator"]
     _parameter_constraints: dict = {
         "estimator": [
             HasMethods(["fit", "predict_proba"]),
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index 9218b5bb6b3be..428f5bcdfe4eb 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -1531,8 +1531,6 @@ class GridSearchCV(BaseSearchCV):
      'std_fit_time', 'std_score_time', 'std_test_score']
     """
 
-    _required_parameters = ["estimator", "param_grid"]
-
     _parameter_constraints: dict = {
         **BaseSearchCV._parameter_constraints,
         "param_grid": [dict, list],
@@ -1912,8 +1910,6 @@ class RandomizedSearchCV(BaseSearchCV):
     {'C': np.float64(2...), 'penalty': 'l1'}
     """
 
-    _required_parameters = ["estimator", "param_distributions"]
-
     _parameter_constraints: dict = {
         **BaseSearchCV._parameter_constraints,
         "param_distributions": [dict, list],
diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py
index e5e3096a85379..5f001d4ea6071 100644
--- a/sklearn/model_selection/_search_successive_halving.py
+++ b/sklearn/model_selection/_search_successive_halving.py
@@ -672,8 +672,6 @@ class HalvingGridSearchCV(BaseSuccessiveHalving):
     {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9}
     """
 
-    _required_parameters = ["estimator", "param_grid"]
-
     _parameter_constraints: dict = {
         **BaseSuccessiveHalving._parameter_constraints,
         "param_grid": [dict, list],
@@ -1022,8 +1020,6 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving):
     {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9}
     """
 
-    _required_parameters = ["estimator", "param_distributions"]
-
     _parameter_constraints: dict = {
         **BaseSuccessiveHalving._parameter_constraints,
         "param_distributions": [dict, list],
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index db0bdf0b6440c..a6bf9c99cb694 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -152,8 +152,6 @@ class Pipeline(_BaseComposition):
     """
 
     # BaseEstimator interface
-    _required_parameters = ["steps"]
-
     _parameter_constraints: dict = {
         "steps": [list, Hidden(tuple)],
         "memory": [None, str, HasMethods(["cache"])],
@@ -1426,8 +1424,6 @@ class FeatureUnion(TransformerMixin, _BaseComposition):
     :ref:`sphx_glr_auto_examples_compose_plot_feature_union.py`.
     """
 
-    _required_parameters = ["transformer_list"]
-
     def __init__(
         self,
         transformer_list,

From 650bb8eb85d65c883cd643d9010e1d4490df5fea Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 5 Sep 2024 09:16:10 +0200
Subject: [PATCH 11/17] use Ridge params for HalvingRandomSearchCV in tests

---
 sklearn/base.py                               | 19 +++++++++++++++++++
 sklearn/tests/test_common.py                  |  6 ++----
 .../utils/_test_common/instance_generator.py  |  5 +++--
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index bc20b2282698f..477d27cec34b0 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -1320,6 +1320,25 @@ class MetaEstimatorMixin:
     .. versionchanged:: 1.6
         The `_required_parameters` is now removed and is unnecessary since tests are
         refactored and don't use this anymore.
+
+    Examples
+    --------
+    >>> from sklearn.base import MetaEstimatorMixin
+    >>> from sklearn.datasets import load_iris
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> class MyEstimator(MetaEstimatorMixin):
+    ...     def __init__(self, *, estimator=None):
+    ...         self.estimator = estimator
+    ...     def fit(self, X, y=None):
+    ...         if self.estimator is None:
+    ...             self.estimator_ = LogisticRegression()
+    ...         else:
+    ...             self.estimator_ = self.estimator
+    ...         return self
+    >>> X, y = load_iris(return_X_y=True)
+    >>> estimator = MyEstimator().fit(X, y)
+    >>> estimator.estimator_
+    LogisticRegression()
     """
 
 
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index ebaa7a6037979..6f56bbbce84cb 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -29,15 +29,13 @@
 from sklearn.compose import ColumnTransformer
 from sklearn.datasets import make_blobs
 from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
+
+# make it possible to discover experimental estimators when calling `all_estimators`
 from sklearn.experimental import (
     enable_halving_search_cv,  # noqa
     enable_iterative_imputer,  # noqa
 )
-
-# make it possible to discover experimental estimators when calling `all_estimators`
 from sklearn.linear_model import LogisticRegression
-
-# make it possible to discover experimental estimators when calling `all_estimators`
 from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding
 from sklearn.neighbors import (
     KNeighborsClassifier,
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index 218969dfa5310..8ce1e5e65c05c 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -337,11 +337,12 @@ def _generate_pipeline():
         error_score="raise",
     ),
     HalvingRandomSearchCV: dict(
-        estimator=LogisticRegression(),
-        param_distributions={"C": [0.1, 1.0]},
+        estimator=Ridge(),
+        param_distributions={"alpha": [0.1, 1.0]},
         min_resources="smallest",
         cv=2,
         error_score="raise",
+        random_state=0,
     ),
     MultiOutputClassifier: dict(estimator=LogisticRegression(C=1)),
     MultiOutputRegressor: dict(estimator=Ridge()),

From 148e5d54156e2c587dedcac1b365e9b4d9fb9c56 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 5 Sep 2024 09:49:28 +0200
Subject: [PATCH 12/17] merge TEST_PARAMS into INIT_PARAMS

---
 sklearn/tests/test_common.py                  |  11 -
 .../utils/_test_common/instance_generator.py  | 368 +++++++++---------
 sklearn/utils/tests/test_estimator_checks.py  |   3 -
 3 files changed, 175 insertions(+), 207 deletions(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 6f56bbbce84cb..ae86c602cef7e 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -58,7 +58,6 @@
     _generate_pipeline,
     _generate_search_cv_instances,
     _get_check_estimator_ids,
-    _set_checking_parameters,
     _tested_estimators,
 )
 from sklearn.utils._testing import (
@@ -139,7 +138,6 @@ def test_estimators(estimator, check, request):
     with ignore_warnings(
         category=(FutureWarning, ConvergenceWarning, UserWarning, LinAlgWarning)
     ):
-        _set_checking_parameters(estimator)
         check(estimator)
 
 
@@ -270,7 +268,6 @@ def test_valid_tag_types(estimator):
     "estimator", _tested_estimators(), ids=_get_check_estimator_ids
 )
 def test_check_n_features_in_after_fitting(estimator):
-    _set_checking_parameters(estimator)
     check_n_features_in_after_fitting(estimator.__class__.__name__, estimator)
 
 
@@ -311,7 +308,6 @@ def _estimators_that_predict_in_fit():
 def test_pandas_column_name_consistency(estimator):
     if isinstance(estimator, ColumnTransformer):
         pytest.skip("ColumnTransformer is not tested here")
-    _set_checking_parameters(estimator)
     with ignore_warnings(category=(FutureWarning)):
         with warnings.catch_warnings(record=True) as record:
             check_dataframe_column_names_consistency(
@@ -347,7 +343,6 @@ def _include_in_get_feature_names_out_check(transformer):
     "transformer", GET_FEATURES_OUT_ESTIMATORS, ids=_get_check_estimator_ids
 )
 def test_transformers_get_feature_names_out(transformer):
-    _set_checking_parameters(transformer)
 
     with ignore_warnings(category=(FutureWarning)):
         check_transformer_get_feature_names_out(
@@ -368,7 +363,6 @@ def test_transformers_get_feature_names_out(transformer):
 )
 def test_estimators_get_feature_names_out_error(estimator):
     estimator_name = estimator.__class__.__name__
-    _set_checking_parameters(estimator)
     check_get_feature_names_out_error(estimator_name, estimator)
 
 
@@ -404,7 +398,6 @@ def test_check_param_validation(estimator):
     if isinstance(estimator, FeatureUnion):
         pytest.skip("FeatureUnion is not tested here")
     name = estimator.__class__.__name__
-    _set_checking_parameters(estimator)
     check_param_validation(name, estimator)
 
 
@@ -469,7 +462,6 @@ def test_set_output_transform(estimator):
             f"Skipping check_set_output_transform for {name}: Does not support"
             " set_output API"
         )
-    _set_checking_parameters(estimator)
     with ignore_warnings(category=(FutureWarning)):
         check_set_output_transform(estimator.__class__.__name__, estimator)
 
@@ -493,7 +485,6 @@ def test_set_output_transform_configured(estimator, check_func):
             f"Skipping {check_func.__name__} for {name}: Does not support"
             " set_output API yet"
         )
-    _set_checking_parameters(estimator)
     with ignore_warnings(category=(FutureWarning)):
         check_func(estimator.__class__.__name__, estimator)
 
@@ -511,8 +502,6 @@ def test_check_inplace_ensure_writeable(estimator):
     else:
         raise SkipTest(f"{name} doesn't require writeable input.")
 
-    _set_checking_parameters(estimator)
-
     # The following estimators can work inplace only with certain settings
     if name == "HDBSCAN":
         estimator.set_params(metric="precomputed", algorithm="brute")
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index 8ce1e5e65c05c..4f6b98917d260 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -149,150 +149,207 @@
 
 # The following dictionary is to indicate constructor arguments suitable for the test
 # suite, which uses very small datasets, and is intended to run rather quickly.
-TEST_PARAMS = {
-    AdaBoostClassifier: dict(n_estimators=5),
-    AdaBoostRegressor: dict(n_estimators=5),
-    AffinityPropagation: dict(max_iter=5),
-    AgglomerativeClustering: dict(n_clusters=2),
-    ARDRegression: dict(max_iter=5),
-    BaggingClassifier: dict(n_estimators=5),
-    BaggingRegressor: dict(n_estimators=5),
-    BayesianGaussianMixture: dict(n_init=2, max_iter=5),
-    BayesianRidge: dict(max_iter=5),
-    BernoulliRBM: dict(n_iter=5, batch_size=10),
-    Birch: dict(n_clusters=2),
-    BisectingKMeans: dict(n_init=2, n_clusters=2, max_iter=5),
-    CalibratedClassifierCV: dict(cv=3),
-    CCA: dict(n_components=1, max_iter=5),
-    ClassifierChain: dict(cv=3),
-    DictionaryLearning: dict(max_iter=20, transform_algorithm="lasso_lars"),
+INIT_PARAMS = {
+    AdaBoostClassifier: {"n_estimators": 5},
+    AdaBoostRegressor: {"n_estimators": 5},
+    AffinityPropagation: {"max_iter": 5},
+    AgglomerativeClustering: {"n_clusters": 2},
+    ARDRegression: {"max_iter": 5},
+    BaggingClassifier: {"n_estimators": 5},
+    BaggingRegressor: {"n_estimators": 5},
+    BayesianGaussianMixture: {"n_init": 2, "max_iter": 5},
+    BayesianRidge: {"max_iter": 5},
+    BernoulliRBM: {"n_iter": 5, "batch_size": 10},
+    Birch: {"n_clusters": 2},
+    BisectingKMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5},
+    CalibratedClassifierCV: {"estimator": LogisticRegression(C=1), "cv": 3},
+    CCA: {"n_components": 1, "max_iter": 5},
+    ClassifierChain: {"base_estimator": LogisticRegression(C=1), "cv": 3},
+    ColumnTransformer: {"transformers": [("trans1", StandardScaler(), [0, 1])]},
+    DictionaryLearning: {"max_iter": 20, "transform_algorithm": "lasso_lars"},
     # the default strategy prior would output constant predictions and fail
     # for check_classifiers_predictions
-    DummyClassifier: dict(strategy="stratified"),
-    ElasticNetCV: dict(max_iter=5, cv=3),
-    ElasticNet: dict(max_iter=5),
-    ExtraTreesClassifier: dict(n_estimators=5),
-    ExtraTreesRegressor: dict(n_estimators=5),
-    FactorAnalysis: dict(max_iter=5),
-    FastICA: dict(max_iter=5),
-    FeatureAgglomeration: dict(n_clusters=2),
-    GammaRegressor: dict(max_iter=5),
-    GaussianMixture: dict(n_init=2, max_iter=5),
+    DummyClassifier: {"strategy": "stratified"},
+    ElasticNetCV: {"max_iter": 5, "cv": 3},
+    ElasticNet: {"max_iter": 5},
+    ExtraTreesClassifier: {"n_estimators": 5},
+    ExtraTreesRegressor: {"n_estimators": 5},
+    FactorAnalysis: {"max_iter": 5},
+    FastICA: {"max_iter": 5},
+    FeatureAgglomeration: {"n_clusters": 2},
+    FeatureUnion: {"transformer_list": [("trans1", StandardScaler())]},
+    FixedThresholdClassifier: {"estimator": LogisticRegression(C=1)},
+    GammaRegressor: {"max_iter": 5},
+    GaussianMixture: {"n_init": 2, "max_iter": 5},
     # Due to the jl lemma and often very few samples, the number
     # of components of the random matrix projection will be probably
     # greater than the number of features.
     # So we impose a smaller number (avoid "auto" mode)
-    GaussianRandomProjection: dict(n_components=2),
-    GradientBoostingClassifier: dict(n_estimators=5),
-    GradientBoostingRegressor: dict(n_estimators=5),
-    GraphicalLassoCV: dict(max_iter=5, cv=3),
-    GraphicalLasso: dict(max_iter=5),
-    GridSearchCV: dict(cv=3),
-    HDBSCAN: dict(min_samples=1),
+    GaussianRandomProjection: {"n_components": 2},
+    GradientBoostingClassifier: {"n_estimators": 5},
+    GradientBoostingRegressor: {"n_estimators": 5},
+    GraphicalLassoCV: {"max_iter": 5, "cv": 3},
+    GraphicalLasso: {"max_iter": 5},
+    GridSearchCV: {
+        "estimator": LogisticRegression(C=1),
+        "param_grid": {"C": [1.0]},
+        "cv": 3,
+    },
+    HalvingGridSearchCV: {
+        "estimator": Ridge(),
+        "min_resources": "smallest",
+        "param_grid": {"alpha": [0.1, 1.0]},
+        "random_state": 0,
+        "cv": 2,
+        "error_score": "raise",
+    },
+    HalvingRandomSearchCV: {
+        "estimator": Ridge(),
+        "param_distributions": {"alpha": [0.1, 1.0]},
+        "min_resources": "smallest",
+        "cv": 2,
+        "error_score": "raise",
+        "random_state": 0,
+    },
+    HDBSCAN: {"min_samples": 1},
     # The default min_samples_leaf (20) isn't appropriate for small
     # datasets (only very shallow trees are built) that the checks use.
-    HistGradientBoostingClassifier: dict(max_iter=5, min_samples_leaf=5),
-    HistGradientBoostingRegressor: dict(max_iter=5, min_samples_leaf=5),
-    HuberRegressor: dict(max_iter=5),
-    IncrementalPCA: dict(batch_size=10),
-    IsolationForest: dict(n_estimators=5),
-    KMeans: dict(n_init=2, n_clusters=2, max_iter=5),
-    LabelPropagation: dict(max_iter=5),
-    LabelSpreading: dict(max_iter=5),
-    LarsCV: dict(max_iter=5, cv=3),
-    LassoCV: dict(max_iter=5, cv=3),
-    Lasso: dict(max_iter=5),
-    LassoLarsCV: dict(max_iter=5, cv=3),
-    LassoLars: dict(max_iter=5),
+    HistGradientBoostingClassifier: {"max_iter": 5, "min_samples_leaf": 5},
+    HistGradientBoostingRegressor: {"max_iter": 5, "min_samples_leaf": 5},
+    HuberRegressor: {"max_iter": 5},
+    IncrementalPCA: {"batch_size": 10},
+    IsolationForest: {"n_estimators": 5},
+    KMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5},
+    LabelPropagation: {"max_iter": 5},
+    LabelSpreading: {"max_iter": 5},
+    LarsCV: {"max_iter": 5, "cv": 3},
+    LassoCV: {"max_iter": 5, "cv": 3},
+    LassoLarsCV: {"max_iter": 5, "cv": 3},
     # Noise variance estimation does not work when `n_samples < n_features`.
     # We need to provide the noise variance explicitly.
-    LassoLarsIC: dict(max_iter=5, noise_variance=1.0),
-    LatentDirichletAllocation: dict(max_iter=5, batch_size=10),
-    LinearSVR: dict(max_iter=20),
-    LinearSVC: dict(max_iter=20),
-    LocallyLinearEmbedding: dict(max_iter=5),
-    LogisticRegressionCV: dict(max_iter=5, cv=3),
-    LogisticRegression: dict(max_iter=5),
-    MDS: dict(n_init=2, max_iter=5),
+    LassoLarsIC: {"max_iter": 5, "noise_variance": 1.0},
+    LassoLars: {"max_iter": 5},
+    Lasso: {"max_iter": 5},
+    LatentDirichletAllocation: {"max_iter": 5, "batch_size": 10},
+    LinearSVC: {"max_iter": 20},
+    LinearSVR: {"max_iter": 20},
+    LocallyLinearEmbedding: {"max_iter": 5},
+    LogisticRegressionCV: {"max_iter": 5, "cv": 3},
+    LogisticRegression: {"max_iter": 5},
+    MDS: {"n_init": 2, "max_iter": 5},
     # In the case of check_fit2d_1sample, bandwidth is set to None and
     # is thus estimated. De facto it is 0.0 as a single sample is provided
     # and this makes the test fails. Hence we give it a placeholder value.
-    MeanShift: dict(max_iter=5, bandwidth=1.0),
-    MiniBatchDictionaryLearning: dict(batch_size=10, max_iter=5),
-    MiniBatchKMeans: dict(n_init=2, n_clusters=2, max_iter=5, batch_size=10),
-    MiniBatchNMF: dict(batch_size=10, max_iter=20, fresh_restarts=True),
-    MiniBatchSparsePCA: dict(max_iter=5, batch_size=10),
-    MLPClassifier: dict(max_iter=100),
-    MLPRegressor: dict(max_iter=100),
-    MultiTaskElasticNetCV: dict(max_iter=5, cv=3),
-    MultiTaskElasticNet: dict(max_iter=5),
-    MultiTaskLassoCV: dict(max_iter=5, cv=3),
-    MultiTaskLasso: dict(max_iter=5),
-    NeighborhoodComponentsAnalysis: dict(max_iter=5),
-    NMF: dict(max_iter=500),
-    NuSVC: dict(max_iter=-1),
-    NuSVR: dict(max_iter=-1),
-    OneClassSVM: dict(max_iter=-1),
-    OneHotEncoder: dict(handle_unknown="ignore"),
-    OrthogonalMatchingPursuitCV: dict(cv=3),
-    PassiveAggressiveClassifier: dict(max_iter=5),
-    PassiveAggressiveRegressor: dict(max_iter=5),
-    Perceptron: dict(max_iter=5),
-    PLSCanonical: dict(n_components=1, max_iter=5),
-    PLSRegression: dict(n_components=1, max_iter=5),
-    PLSSVD: dict(n_components=1),
-    PoissonRegressor: dict(max_iter=5),
-    RandomForestClassifier: dict(n_estimators=5),
-    RandomForestRegressor: dict(n_estimators=5),
-    RandomizedSearchCV: dict(n_iter=5, cv=3),
-    RandomTreesEmbedding: dict(n_estimators=5),
-    RANSACRegressor: dict(max_trials=10),
-    RegressorChain: dict(cv=3),
-    RFECV: dict(cv=3),
+    MeanShift: {"max_iter": 5, "bandwidth": 1.0},
+    MiniBatchDictionaryLearning: {"batch_size": 10, "max_iter": 5},
+    MiniBatchKMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5, "batch_size": 10},
+    MiniBatchNMF: {"batch_size": 10, "max_iter": 20, "fresh_restarts": True},
+    MiniBatchSparsePCA: {"max_iter": 5, "batch_size": 10},
+    MLPClassifier: {"max_iter": 100},
+    MLPRegressor: {"max_iter": 100},
+    MultiOutputClassifier: {"estimator": LogisticRegression(C=1)},
+    MultiOutputRegressor: {"estimator": Ridge()},
+    MultiTaskElasticNetCV: {"max_iter": 5, "cv": 3},
+    MultiTaskElasticNet: {"max_iter": 5},
+    MultiTaskLassoCV: {"max_iter": 5, "cv": 3},
+    MultiTaskLasso: {"max_iter": 5},
+    NeighborhoodComponentsAnalysis: {"max_iter": 5},
+    NMF: {"max_iter": 500},
+    NuSVC: {"max_iter": -1},
+    NuSVR: {"max_iter": -1},
+    OneClassSVM: {"max_iter": -1},
+    OneHotEncoder: {"handle_unknown": "ignore"},
+    OneVsOneClassifier: {"estimator": LogisticRegression(C=1)},
+    OneVsRestClassifier: {"estimator": LogisticRegression(C=1)},
+    OrthogonalMatchingPursuitCV: {"cv": 3},
+    OutputCodeClassifier: {"estimator": LogisticRegression(C=1)},
+    PassiveAggressiveClassifier: {"max_iter": 5},
+    PassiveAggressiveRegressor: {"max_iter": 5},
+    Perceptron: {"max_iter": 5},
+    Pipeline: {"steps": [("scaler", StandardScaler()), ("est", Ridge())]},
+    PLSCanonical: {"n_components": 1, "max_iter": 5},
+    PLSRegression: {"n_components": 1, "max_iter": 5},
+    PLSSVD: {"n_components": 1},
+    PoissonRegressor: {"max_iter": 5},
+    RandomForestClassifier: {"n_estimators": 5},
+    RandomForestRegressor: {"n_estimators": 5},
+    RandomizedSearchCV: {
+        "estimator": LogisticRegression(C=1),
+        "param_distributions": {"C": [1.0]},
+        "n_iter": 5,
+        "cv": 3,
+    },
+    RandomTreesEmbedding: {"n_estimators": 5},
+    # `RANSACRegressor` will raise an error with any model other
+    # than `LinearRegression` if we don't fix `min_samples` parameter.
+    # For common test, we can enforce using `LinearRegression` that
+    # is the default estimator in `RANSACRegressor` instead of `Ridge`.
+    RANSACRegressor: {"estimator": LinearRegression(), "max_trials": 10},
+    RegressorChain: {"base_estimator": Ridge(), "cv": 3},
+    RFECV: {"estimator": LogisticRegression(C=1), "cv": 3},
+    RFE: {"estimator": LogisticRegression(C=1)},
     # be tolerant of noisy datasets (not actually speed)
-    SelectFdr: dict(alpha=0.5),
+    SelectFdr: {"alpha": 0.5},
+    # Increases coverage because SGDRegressor has partial_fit
+    SelectFromModel: {"estimator": SGDRegressor(random_state=0)},
     # SelectKBest has a default of k=10
     # which is more feature than we have in most case.
-    SelectKBest: dict(k=1),
-    SelfTrainingClassifier: dict(max_iter=5),
-    SequentialFeatureSelector: dict(cv=3),
-    SGDClassifier: dict(max_iter=5),
-    SGDOneClassSVM: dict(max_iter=5),
-    SGDRegressor: dict(max_iter=5),
-    SparsePCA: dict(max_iter=5),
+    SelectKBest: {"k": 1},
+    SelfTrainingClassifier: {"estimator": LogisticRegression(C=1), "max_iter": 5},
+    SequentialFeatureSelector: {"estimator": LogisticRegression(C=1), "cv": 3},
+    SGDClassifier: {"max_iter": 5},
+    SGDOneClassSVM: {"max_iter": 5},
+    SGDRegressor: {"max_iter": 5},
+    SparsePCA: {"max_iter": 5},
     # Due to the jl lemma and often very few samples, the number
     # of components of the random matrix projection will be probably
     # greater than the number of features.
     # So we impose a smaller number (avoid "auto" mode)
-    SparseRandomProjection: dict(n_components=2),
-    SpectralBiclustering: dict(n_init=2, n_best=1, n_clusters=2),
-    SpectralClustering: dict(n_init=2, n_clusters=2),
-    SpectralCoclustering: dict(n_init=2, n_clusters=2),
+    SparseRandomProjection: {"n_components": 2},
+    SpectralBiclustering: {"n_init": 2, "n_best": 1, "n_clusters": 2},
+    SpectralClustering: {"n_init": 2, "n_clusters": 2},
+    SpectralCoclustering: {"n_init": 2, "n_clusters": 2},
     # Default "auto" parameter can lead to different ordering of eigenvalues on
     # windows: #24105
-    SpectralEmbedding: dict(eigen_tol=1e-5),
-    StackingClassifier: dict(cv=3),
-    StackingRegressor: dict(cv=3),
-    SVC: dict(max_iter=-1),
-    SVR: dict(max_iter=-1),
-    TargetEncoder: dict(cv=3),
-    TheilSenRegressor: dict(max_iter=5, max_subpopulation=100),
+    SpectralEmbedding: {"eigen_tol": 1e-05},
+    StackingClassifier: {
+        "estimators": [
+            ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
+            ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
+        ],
+        "cv": 3,
+    },
+    StackingRegressor: {
+        "estimators": [
+            ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
+            ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
+        ],
+        "cv": 3,
+    },
+    SVC: {"max_iter": -1},
+    SVR: {"max_iter": -1},
+    TargetEncoder: {"cv": 3},
+    TheilSenRegressor: {"max_iter": 5, "max_subpopulation": 100},
     # TruncatedSVD doesn't run with n_components = n_features
-    TruncatedSVD: dict(n_iter=5, n_components=1),
-    TSNE: dict(perplexity=2),
-    TunedThresholdClassifierCV: dict(cv=3),
-    TweedieRegressor: dict(max_iter=5),
+    TruncatedSVD: {"n_iter": 5, "n_components": 1},
+    TSNE: {"perplexity": 2},
+    TunedThresholdClassifierCV: {"estimator": LogisticRegression(C=1), "cv": 3},
+    TweedieRegressor: {"max_iter": 5},
+    VotingClassifier: {
+        "estimators": [
+            ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
+            ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
+        ]
+    },
+    VotingRegressor: {
+        "estimators": [
+            ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
+            ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
+        ]
+    },
 }
 
 
-def _set_checking_parameters(estimator):
-    """Set the parameters of an estimator instance to speed-up tests and avoid
-    deprecation warnings in common test."""
-    if type(estimator) in TEST_PARAMS:
-        test_params = TEST_PARAMS[type(estimator)]
-        estimator.set_params(**test_params)
-
-
 def _tested_estimators(type_filter=None):
     for name, Estimator in all_estimators(type_filter=type_filter):
         try:
@@ -316,81 +373,6 @@ def _generate_pipeline():
         )
 
 
-INIT_PARAMS = {
-    SelfTrainingClassifier: dict(estimator=LogisticRegression(C=1)),
-    CalibratedClassifierCV: dict(estimator=LogisticRegression(C=1)),
-    ClassifierChain: dict(base_estimator=LogisticRegression(C=1)),
-    ColumnTransformer: dict(
-        transformers=[
-            ("trans1", StandardScaler(), [0, 1]),
-        ]
-    ),
-    FeatureUnion: dict(transformer_list=[("trans1", StandardScaler())]),
-    FixedThresholdClassifier: dict(estimator=LogisticRegression(C=1)),
-    GridSearchCV: dict(estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}),
-    HalvingGridSearchCV: dict(
-        estimator=Ridge(),
-        min_resources="smallest",
-        param_grid={"alpha": [0.1, 1.0]},
-        random_state=0,
-        cv=2,
-        error_score="raise",
-    ),
-    HalvingRandomSearchCV: dict(
-        estimator=Ridge(),
-        param_distributions={"alpha": [0.1, 1.0]},
-        min_resources="smallest",
-        cv=2,
-        error_score="raise",
-        random_state=0,
-    ),
-    MultiOutputClassifier: dict(estimator=LogisticRegression(C=1)),
-    MultiOutputRegressor: dict(estimator=Ridge()),
-    OneVsOneClassifier: dict(estimator=LogisticRegression(C=1)),
-    OneVsRestClassifier: dict(estimator=LogisticRegression(C=1)),
-    OutputCodeClassifier: dict(estimator=LogisticRegression(C=1)),
-    Pipeline: dict(steps=[("scaler", StandardScaler()), ("est", Ridge())]),
-    RandomizedSearchCV: dict(
-        estimator=LogisticRegression(C=1), param_distributions={"C": [1.0]}
-    ),
-    # `RANSACRegressor` will raise an error with any model other
-    # than `LinearRegression` if we don't fix `min_samples` parameter.
-    # For common test, we can enforce using `LinearRegression` that
-    # is the default estimator in `RANSACRegressor` instead of `Ridge`.
-    RANSACRegressor: dict(estimator=LinearRegression()),
-    RegressorChain: dict(base_estimator=Ridge()),
-    RFECV: dict(estimator=LogisticRegression(C=1)),
-    RFE: dict(estimator=LogisticRegression(C=1)),
-    # Increases coverage because SGDRegressor has partial_fit
-    SelectFromModel: dict(estimator=SGDRegressor(random_state=0)),
-    SequentialFeatureSelector: dict(estimator=LogisticRegression(C=1)),
-    StackingClassifier: dict(
-        estimators=[
-            ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
-            ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
-        ]
-    ),
-    StackingRegressor: dict(
-        estimators=[
-            ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
-            ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
-        ]
-    ),
-    TunedThresholdClassifierCV: dict(estimator=LogisticRegression(C=1)),
-    VotingClassifier: dict(
-        estimators=[
-            ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
-            ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
-        ]
-    ),
-    VotingRegressor: dict(
-        estimators=[
-            ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
-            ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
-        ]
-    ),
-}
-
 SKIPPED_ESTIMATORS = [SparseCoder]
 
 
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index b90c8d0133dbe..65fbd8a50f3ba 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -30,7 +30,6 @@
 from sklearn.svm import SVC, NuSVC
 from sklearn.utils import _array_api, all_estimators, deprecated
 from sklearn.utils._param_validation import Interval, StrOptions
-from sklearn.utils._test_common.instance_generator import _set_checking_parameters
 from sklearn.utils._testing import (
     MinimalClassifier,
     MinimalRegressor,
@@ -745,7 +744,6 @@ def test_check_estimator_clones():
         # without fitting
         with ignore_warnings(category=ConvergenceWarning):
             est = Estimator()
-            _set_checking_parameters(est)
             set_random_state(est)
             old_hash = joblib.hash(est)
             check_estimator(est)
@@ -754,7 +752,6 @@ def test_check_estimator_clones():
         # with fitting
         with ignore_warnings(category=ConvergenceWarning):
             est = Estimator()
-            _set_checking_parameters(est)
             set_random_state(est)
             est.fit(iris.data + 10, iris.target)
             old_hash = joblib.hash(est)

From 32a6ec67506ab6d9dd36845dcad5499b3c8034ba Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 5 Sep 2024 09:58:13 +0200
Subject: [PATCH 13/17] reduce diff

---
 .../utils/_test_common/instance_generator.py  | 312 +++++++++---------
 1 file changed, 155 insertions(+), 157 deletions(-)

diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index 4f6b98917d260..01a028aaf201e 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -150,203 +150,201 @@
 # The following dictionary is to indicate constructor arguments suitable for the test
 # suite, which uses very small datasets, and is intended to run rather quickly.
 INIT_PARAMS = {
-    AdaBoostClassifier: {"n_estimators": 5},
-    AdaBoostRegressor: {"n_estimators": 5},
-    AffinityPropagation: {"max_iter": 5},
-    AgglomerativeClustering: {"n_clusters": 2},
-    ARDRegression: {"max_iter": 5},
-    BaggingClassifier: {"n_estimators": 5},
-    BaggingRegressor: {"n_estimators": 5},
-    BayesianGaussianMixture: {"n_init": 2, "max_iter": 5},
-    BayesianRidge: {"max_iter": 5},
-    BernoulliRBM: {"n_iter": 5, "batch_size": 10},
-    Birch: {"n_clusters": 2},
-    BisectingKMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5},
-    CalibratedClassifierCV: {"estimator": LogisticRegression(C=1), "cv": 3},
-    CCA: {"n_components": 1, "max_iter": 5},
-    ClassifierChain: {"base_estimator": LogisticRegression(C=1), "cv": 3},
-    ColumnTransformer: {"transformers": [("trans1", StandardScaler(), [0, 1])]},
-    DictionaryLearning: {"max_iter": 20, "transform_algorithm": "lasso_lars"},
+    AdaBoostClassifier: dict(n_estimators=5),
+    AdaBoostRegressor: dict(n_estimators=5),
+    AffinityPropagation: dict(max_iter=5),
+    AgglomerativeClustering: dict(n_clusters=2),
+    ARDRegression: dict(max_iter=5),
+    BaggingClassifier: dict(n_estimators=5),
+    BaggingRegressor: dict(n_estimators=5),
+    BayesianGaussianMixture: dict(n_init=2, max_iter=5),
+    BayesianRidge: dict(max_iter=5),
+    BernoulliRBM: dict(n_iter=5, batch_size=10),
+    Birch: dict(n_clusters=2),
+    BisectingKMeans: dict(n_init=2, n_clusters=2, max_iter=5),
+    CalibratedClassifierCV: dict(estimator=LogisticRegression(C=1), cv=3),
+    CCA: dict(n_components=1, max_iter=5),
+    ClassifierChain: dict(base_estimator=LogisticRegression(C=1), cv=3),
+    ColumnTransformer: dict(transformers=[("trans1", StandardScaler(), [0, 1])]),
+    DictionaryLearning: dict(max_iter=20, transform_algorithm="lasso_lars"),
     # the default strategy prior would output constant predictions and fail
     # for check_classifiers_predictions
-    DummyClassifier: {"strategy": "stratified"},
-    ElasticNetCV: {"max_iter": 5, "cv": 3},
-    ElasticNet: {"max_iter": 5},
-    ExtraTreesClassifier: {"n_estimators": 5},
-    ExtraTreesRegressor: {"n_estimators": 5},
-    FactorAnalysis: {"max_iter": 5},
-    FastICA: {"max_iter": 5},
-    FeatureAgglomeration: {"n_clusters": 2},
-    FeatureUnion: {"transformer_list": [("trans1", StandardScaler())]},
-    FixedThresholdClassifier: {"estimator": LogisticRegression(C=1)},
-    GammaRegressor: {"max_iter": 5},
-    GaussianMixture: {"n_init": 2, "max_iter": 5},
+    DummyClassifier: dict(strategy="stratified"),
+    ElasticNetCV: dict(max_iter=5, cv=3),
+    ElasticNet: dict(max_iter=5),
+    ExtraTreesClassifier: dict(n_estimators=5),
+    ExtraTreesRegressor: dict(n_estimators=5),
+    FactorAnalysis: dict(max_iter=5),
+    FastICA: dict(max_iter=5),
+    FeatureAgglomeration: dict(n_clusters=2),
+    FeatureUnion: dict(transformer_list=[("trans1", StandardScaler())]),
+    FixedThresholdClassifier: dict(estimator=LogisticRegression(C=1)),
+    GammaRegressor: dict(max_iter=5),
+    GaussianMixture: dict(n_init=2, max_iter=5),
     # Due to the jl lemma and often very few samples, the number
     # of components of the random matrix projection will be probably
     # greater than the number of features.
     # So we impose a smaller number (avoid "auto" mode)
-    GaussianRandomProjection: {"n_components": 2},
-    GradientBoostingClassifier: {"n_estimators": 5},
-    GradientBoostingRegressor: {"n_estimators": 5},
-    GraphicalLassoCV: {"max_iter": 5, "cv": 3},
-    GraphicalLasso: {"max_iter": 5},
-    GridSearchCV: {
-        "estimator": LogisticRegression(C=1),
-        "param_grid": {"C": [1.0]},
-        "cv": 3,
-    },
-    HalvingGridSearchCV: {
-        "estimator": Ridge(),
-        "min_resources": "smallest",
-        "param_grid": {"alpha": [0.1, 1.0]},
-        "random_state": 0,
-        "cv": 2,
-        "error_score": "raise",
-    },
-    HalvingRandomSearchCV: {
-        "estimator": Ridge(),
-        "param_distributions": {"alpha": [0.1, 1.0]},
-        "min_resources": "smallest",
-        "cv": 2,
-        "error_score": "raise",
-        "random_state": 0,
-    },
-    HDBSCAN: {"min_samples": 1},
+    GaussianRandomProjection: dict(n_components=2),
+    GradientBoostingClassifier: dict(n_estimators=5),
+    GradientBoostingRegressor: dict(n_estimators=5),
+    GraphicalLassoCV: dict(max_iter=5, cv=3),
+    GraphicalLasso: dict(max_iter=5),
+    GridSearchCV: dict(
+        estimator=LogisticRegression(C=1), param_grid={"C": [1.0]}, cv=3
+    ),
+    HalvingGridSearchCV: dict(
+        estimator=Ridge(),
+        min_resources="smallest",
+        param_grid={"alpha": [0.1, 1.0]},
+        random_state=0,
+        cv=2,
+        error_score="raise",
+    ),
+    HalvingRandomSearchCV: dict(
+        estimator=Ridge(),
+        param_distributions={"alpha": [0.1, 1.0]},
+        min_resources="smallest",
+        cv=2,
+        error_score="raise",
+        random_state=0,
+    ),
+    HDBSCAN: dict(min_samples=1),
     # The default min_samples_leaf (20) isn't appropriate for small
     # datasets (only very shallow trees are built) that the checks use.
-    HistGradientBoostingClassifier: {"max_iter": 5, "min_samples_leaf": 5},
-    HistGradientBoostingRegressor: {"max_iter": 5, "min_samples_leaf": 5},
-    HuberRegressor: {"max_iter": 5},
-    IncrementalPCA: {"batch_size": 10},
-    IsolationForest: {"n_estimators": 5},
-    KMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5},
-    LabelPropagation: {"max_iter": 5},
-    LabelSpreading: {"max_iter": 5},
-    LarsCV: {"max_iter": 5, "cv": 3},
-    LassoCV: {"max_iter": 5, "cv": 3},
-    LassoLarsCV: {"max_iter": 5, "cv": 3},
+    HistGradientBoostingClassifier: dict(max_iter=5, min_samples_leaf=5),
+    HistGradientBoostingRegressor: dict(max_iter=5, min_samples_leaf=5),
+    HuberRegressor: dict(max_iter=5),
+    IncrementalPCA: dict(batch_size=10),
+    IsolationForest: dict(n_estimators=5),
+    KMeans: dict(n_init=2, n_clusters=2, max_iter=5),
+    LabelPropagation: dict(max_iter=5),
+    LabelSpreading: dict(max_iter=5),
+    LarsCV: dict(max_iter=5, cv=3),
+    LassoCV: dict(max_iter=5, cv=3),
+    Lasso: dict(max_iter=5),
+    LassoLarsCV: dict(max_iter=5, cv=3),
+    LassoLars: dict(max_iter=5),
     # Noise variance estimation does not work when `n_samples < n_features`.
     # We need to provide the noise variance explicitly.
-    LassoLarsIC: {"max_iter": 5, "noise_variance": 1.0},
-    LassoLars: {"max_iter": 5},
-    Lasso: {"max_iter": 5},
-    LatentDirichletAllocation: {"max_iter": 5, "batch_size": 10},
-    LinearSVC: {"max_iter": 20},
-    LinearSVR: {"max_iter": 20},
-    LocallyLinearEmbedding: {"max_iter": 5},
-    LogisticRegressionCV: {"max_iter": 5, "cv": 3},
-    LogisticRegression: {"max_iter": 5},
-    MDS: {"n_init": 2, "max_iter": 5},
+    LassoLarsIC: dict(max_iter=5, noise_variance=1.0),
+    LatentDirichletAllocation: dict(max_iter=5, batch_size=10),
+    LinearSVC: dict(max_iter=20),
+    LinearSVR: dict(max_iter=20),
+    LocallyLinearEmbedding: dict(max_iter=5),
+    LogisticRegressionCV: dict(max_iter=5, cv=3),
+    LogisticRegression: dict(max_iter=5),
+    MDS: dict(n_init=2, max_iter=5),
     # In the case of check_fit2d_1sample, bandwidth is set to None and
     # is thus estimated. De facto it is 0.0 as a single sample is provided
     # and this makes the test fails. Hence we give it a placeholder value.
-    MeanShift: {"max_iter": 5, "bandwidth": 1.0},
-    MiniBatchDictionaryLearning: {"batch_size": 10, "max_iter": 5},
-    MiniBatchKMeans: {"n_init": 2, "n_clusters": 2, "max_iter": 5, "batch_size": 10},
-    MiniBatchNMF: {"batch_size": 10, "max_iter": 20, "fresh_restarts": True},
-    MiniBatchSparsePCA: {"max_iter": 5, "batch_size": 10},
-    MLPClassifier: {"max_iter": 100},
-    MLPRegressor: {"max_iter": 100},
-    MultiOutputClassifier: {"estimator": LogisticRegression(C=1)},
-    MultiOutputRegressor: {"estimator": Ridge()},
-    MultiTaskElasticNetCV: {"max_iter": 5, "cv": 3},
-    MultiTaskElasticNet: {"max_iter": 5},
-    MultiTaskLassoCV: {"max_iter": 5, "cv": 3},
-    MultiTaskLasso: {"max_iter": 5},
-    NeighborhoodComponentsAnalysis: {"max_iter": 5},
-    NMF: {"max_iter": 500},
-    NuSVC: {"max_iter": -1},
-    NuSVR: {"max_iter": -1},
-    OneClassSVM: {"max_iter": -1},
-    OneHotEncoder: {"handle_unknown": "ignore"},
-    OneVsOneClassifier: {"estimator": LogisticRegression(C=1)},
-    OneVsRestClassifier: {"estimator": LogisticRegression(C=1)},
-    OrthogonalMatchingPursuitCV: {"cv": 3},
-    OutputCodeClassifier: {"estimator": LogisticRegression(C=1)},
-    PassiveAggressiveClassifier: {"max_iter": 5},
-    PassiveAggressiveRegressor: {"max_iter": 5},
-    Perceptron: {"max_iter": 5},
-    Pipeline: {"steps": [("scaler", StandardScaler()), ("est", Ridge())]},
-    PLSCanonical: {"n_components": 1, "max_iter": 5},
-    PLSRegression: {"n_components": 1, "max_iter": 5},
-    PLSSVD: {"n_components": 1},
-    PoissonRegressor: {"max_iter": 5},
-    RandomForestClassifier: {"n_estimators": 5},
-    RandomForestRegressor: {"n_estimators": 5},
-    RandomizedSearchCV: {
-        "estimator": LogisticRegression(C=1),
-        "param_distributions": {"C": [1.0]},
-        "n_iter": 5,
-        "cv": 3,
-    },
-    RandomTreesEmbedding: {"n_estimators": 5},
+    MeanShift: dict(max_iter=5, bandwidth=1.0),
+    MiniBatchDictionaryLearning: dict(batch_size=10, max_iter=5),
+    MiniBatchKMeans: dict(n_init=2, n_clusters=2, max_iter=5, batch_size=10),
+    MiniBatchNMF: dict(batch_size=10, max_iter=20, fresh_restarts=True),
+    MiniBatchSparsePCA: dict(max_iter=5, batch_size=10),
+    MLPClassifier: dict(max_iter=100),
+    MLPRegressor: dict(max_iter=100),
+    MultiOutputClassifier: dict(estimator=LogisticRegression(C=1)),
+    MultiOutputRegressor: dict(estimator=Ridge()),
+    MultiTaskElasticNetCV: dict(max_iter=5, cv=3),
+    MultiTaskElasticNet: dict(max_iter=5),
+    MultiTaskLassoCV: dict(max_iter=5, cv=3),
+    MultiTaskLasso: dict(max_iter=5),
+    NeighborhoodComponentsAnalysis: dict(max_iter=5),
+    NMF: dict(max_iter=500),
+    NuSVC: dict(max_iter=-1),
+    NuSVR: dict(max_iter=-1),
+    OneClassSVM: dict(max_iter=-1),
+    OneHotEncoder: dict(handle_unknown="ignore"),
+    OneVsOneClassifier: dict(estimator=LogisticRegression(C=1)),
+    OneVsRestClassifier: dict(estimator=LogisticRegression(C=1)),
+    OrthogonalMatchingPursuitCV: dict(cv=3),
+    OutputCodeClassifier: dict(estimator=LogisticRegression(C=1)),
+    PassiveAggressiveClassifier: dict(max_iter=5),
+    PassiveAggressiveRegressor: dict(max_iter=5),
+    Perceptron: dict(max_iter=5),
+    Pipeline: dict(steps=[("scaler", StandardScaler()), ("est", Ridge())]),
+    PLSCanonical: dict(n_components=1, max_iter=5),
+    PLSRegression: dict(n_components=1, max_iter=5),
+    PLSSVD: dict(n_components=1),
+    PoissonRegressor: dict(max_iter=5),
+    RandomForestClassifier: dict(n_estimators=5),
+    RandomForestRegressor: dict(n_estimators=5),
+    RandomizedSearchCV: dict(
+        estimator=LogisticRegression(C=1),
+        param_distributions={"C": [1.0]},
+        n_iter=5,
+        cv=3,
+    ),
+    RandomTreesEmbedding: dict(n_estimators=5),
     # `RANSACRegressor` will raise an error with any model other
     # than `LinearRegression` if we don't fix `min_samples` parameter.
     # For common test, we can enforce using `LinearRegression` that
     # is the default estimator in `RANSACRegressor` instead of `Ridge`.
-    RANSACRegressor: {"estimator": LinearRegression(), "max_trials": 10},
-    RegressorChain: {"base_estimator": Ridge(), "cv": 3},
-    RFECV: {"estimator": LogisticRegression(C=1), "cv": 3},
-    RFE: {"estimator": LogisticRegression(C=1)},
+    RANSACRegressor: dict(estimator=LinearRegression(), max_trials=10),
+    RegressorChain: dict(base_estimator=Ridge(), cv=3),
+    RFECV: dict(estimator=LogisticRegression(C=1), cv=3),
+    RFE: dict(estimator=LogisticRegression(C=1)),
     # be tolerant of noisy datasets (not actually speed)
-    SelectFdr: {"alpha": 0.5},
+    SelectFdr: dict(alpha=0.5),
     # Increases coverage because SGDRegressor has partial_fit
-    SelectFromModel: {"estimator": SGDRegressor(random_state=0)},
+    SelectFromModel: dict(estimator=SGDRegressor(random_state=0)),
     # SelectKBest has a default of k=10
     # which is more feature than we have in most case.
-    SelectKBest: {"k": 1},
-    SelfTrainingClassifier: {"estimator": LogisticRegression(C=1), "max_iter": 5},
-    SequentialFeatureSelector: {"estimator": LogisticRegression(C=1), "cv": 3},
-    SGDClassifier: {"max_iter": 5},
-    SGDOneClassSVM: {"max_iter": 5},
-    SGDRegressor: {"max_iter": 5},
-    SparsePCA: {"max_iter": 5},
+    SelectKBest: dict(k=1),
+    SelfTrainingClassifier: dict(estimator=LogisticRegression(C=1), max_iter=5),
+    SequentialFeatureSelector: dict(estimator=LogisticRegression(C=1), cv=3),
+    SGDClassifier: dict(max_iter=5),
+    SGDOneClassSVM: dict(max_iter=5),
+    SGDRegressor: dict(max_iter=5),
+    SparsePCA: dict(max_iter=5),
     # Due to the jl lemma and often very few samples, the number
     # of components of the random matrix projection will be probably
     # greater than the number of features.
     # So we impose a smaller number (avoid "auto" mode)
-    SparseRandomProjection: {"n_components": 2},
-    SpectralBiclustering: {"n_init": 2, "n_best": 1, "n_clusters": 2},
-    SpectralClustering: {"n_init": 2, "n_clusters": 2},
-    SpectralCoclustering: {"n_init": 2, "n_clusters": 2},
+    SparseRandomProjection: dict(n_components=2),
+    SpectralBiclustering: dict(n_init=2, n_best=1, n_clusters=2),
+    SpectralClustering: dict(n_init=2, n_clusters=2),
+    SpectralCoclustering: dict(n_init=2, n_clusters=2),
     # Default "auto" parameter can lead to different ordering of eigenvalues on
     # windows: #24105
-    SpectralEmbedding: {"eigen_tol": 1e-05},
-    StackingClassifier: {
-        "estimators": [
+    SpectralEmbedding: dict(eigen_tol=1e-05),
+    StackingClassifier: dict(
+        estimators=[
             ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
             ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
         ],
-        "cv": 3,
-    },
-    StackingRegressor: {
-        "estimators": [
+        cv=3,
+    ),
+    StackingRegressor: dict(
+        estimators=[
             ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
             ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
         ],
-        "cv": 3,
-    },
-    SVC: {"max_iter": -1},
-    SVR: {"max_iter": -1},
-    TargetEncoder: {"cv": 3},
-    TheilSenRegressor: {"max_iter": 5, "max_subpopulation": 100},
+        cv=3,
+    ),
+    SVC: dict(max_iter=-1),
+    SVR: dict(max_iter=-1),
+    TargetEncoder: dict(cv=3),
+    TheilSenRegressor: dict(max_iter=5, max_subpopulation=100),
     # TruncatedSVD doesn't run with n_components = n_features
-    TruncatedSVD: {"n_iter": 5, "n_components": 1},
-    TSNE: {"perplexity": 2},
-    TunedThresholdClassifierCV: {"estimator": LogisticRegression(C=1), "cv": 3},
-    TweedieRegressor: {"max_iter": 5},
-    VotingClassifier: {
-        "estimators": [
+    TruncatedSVD: dict(n_iter=5, n_components=1),
+    TSNE: dict(perplexity=2),
+    TunedThresholdClassifierCV: dict(estimator=LogisticRegression(C=1), cv=3),
+    TweedieRegressor: dict(max_iter=5),
+    VotingClassifier: dict(
+        estimators=[
             ("est1", DecisionTreeClassifier(max_depth=3, random_state=0)),
             ("est2", DecisionTreeClassifier(max_depth=3, random_state=1)),
         ]
-    },
-    VotingRegressor: {
-        "estimators": [
+    ),
+    VotingRegressor: dict(
+        estimators=[
             ("est1", DecisionTreeRegressor(max_depth=3, random_state=0)),
             ("est2", DecisionTreeRegressor(max_depth=3, random_state=1)),
         ]
-    },
+    ),
 }
 
 

From 0c7366c3e071fe732e887975ab3561fc706e875b Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 5 Sep 2024 10:13:51 +0200
Subject: [PATCH 14/17] use new tags

---
 sklearn/compose/_column_transformer.py | 26 +++++++++++++-------------
 sklearn/pipeline.py                    | 14 +++++++-------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index fa97d6ff4edbd..3099e25e64bed 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -1318,20 +1318,20 @@ def get_metadata_routing(self):
 
         return router
 
-    def _more_tags(self):
-        return {
-            "_xfail_checks": {
-                "check_estimators_empty_data_messages": "FIXME",
-                "check_estimators_nan_inf": "FIXME",
-                "check_estimator_sparse_array": "FIXME",
-                "check_estimator_sparse_matrix": "FIXME",
-                "check_transformer_data_not_an_array": "FIXME",
-                "check_fit1d": "FIXME",
-                "check_fit2d_predict1d": "FIXME",
-                "check_complex_data": "FIXME",
-                "check_fit2d_1feature": "FIXME",
-            }
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags._xfail_checks = {
+            "check_estimators_empty_data_messages": "FIXME",
+            "check_estimators_nan_inf": "FIXME",
+            "check_estimator_sparse_array": "FIXME",
+            "check_estimator_sparse_matrix": "FIXME",
+            "check_transformer_data_not_an_array": "FIXME",
+            "check_fit1d": "FIXME",
+            "check_fit2d_predict1d": "FIXME",
+            "check_complex_data": "FIXME",
+            "check_fit2d_1feature": "FIXME",
         }
+        return tags
 
 
 def _check_X(X):
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index b09ae8f01381c..41daced76c1a9 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -1878,14 +1878,14 @@ def get_metadata_routing(self):
 
         return router
 
-    def _more_tags(self):
-        return {
-            "_xfail_checks": {
-                "check_estimators_overwrite_params": "FIXME",
-                "check_estimators_nan_inf": "FIXME",
-                "check_dont_overwrite_parameters": "FIXME",
-            }
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags._xfail_checks = {
+            "check_estimators_overwrite_params": "FIXME",
+            "check_estimators_nan_inf": "FIXME",
+            "check_dont_overwrite_parameters": "FIXME",
         }
+        return tags
 
 
 def make_union(*transformers, n_jobs=None, verbose=False):

From 27d315d5ff1b91fb4739dbb8f9671432a1f5c4de Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 5 Sep 2024 13:27:02 +0200
Subject: [PATCH 15/17] test error messages of tests

---
 sklearn/utils/tests/test_estimator_checks.py | 28 ++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index a5c2a73aa5061..7a851bb4e7a6c 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -51,6 +51,8 @@
     check_dataframe_column_names_consistency,
     check_decision_proba_consistency,
     check_estimator,
+    check_estimator_cloneable,
+    check_estimator_repr,
     check_estimators_unfitted,
     check_fit_check_is_fitted,
     check_fit_score_takes_y,
@@ -1294,3 +1296,29 @@ def get_check_name(check):
     non_legacy_check_names = {get_check_name(check) for check in non_legacy_checks}
     legacy_check_names = {get_check_name(check) for check in legacy_checks}
     assert non_legacy_check_names.issubset(legacy_check_names)
+
+
+def test_check_estimator_cloneable_error():
+    """Check that the right error is raised when the estimator is not cloneable."""
+
+    class NotCloneable(BaseEstimator):
+        def __sklearn_clone__(self):
+            raise NotImplementedError("This estimator is not cloneable.")
+
+    estimator = NotCloneable()
+    msg = "Cloning of .* failed with error"
+    with raises(AssertionError, match=msg):
+        check_estimator_cloneable("NotCloneable", estimator)
+
+
+def test_estimator_repr_error():
+    """Check that the right error is raised when the estimator does not do a repr."""
+
+    class NotRepr(BaseEstimator):
+        def __repr__(self):
+            raise NotImplementedError("This estimator does not have a repr.")
+
+    estimator = NotRepr()
+    msg = "Repr of .* failed with error"
+    with raises(AssertionError, match=msg):
+        check_estimator_repr("NotRepr", estimator)

From 912b14a43c5cc971da1ac7b0d3513ec274ac1a78 Mon Sep 17 00:00:00 2001
From: Omar Salman <omar.salman2007@gmail.com>
Date: Fri, 6 Sep 2024 11:14:58 +0500
Subject: [PATCH 16/17] Update sklearn/utils/_test_common/instance_generator.py

---
 sklearn/utils/_test_common/instance_generator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index 01a028aaf201e..aff5d58a8f3a7 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -279,8 +279,8 @@
     ),
     RandomTreesEmbedding: dict(n_estimators=5),
     # `RANSACRegressor` will raise an error with any model other
-    # than `LinearRegression` if we don't fix `min_samples` parameter.
-    # For common test, we can enforce using `LinearRegression` that
+    # than `LinearRegression` if we don't fix the `min_samples` parameter.
+    # For common tests, we can enforce using `LinearRegression` that
     # is the default estimator in `RANSACRegressor` instead of `Ridge`.
     RANSACRegressor: dict(estimator=LinearRegression(), max_trials=10),
     RegressorChain: dict(base_estimator=Ridge(), cv=3),

From fd0332906f983264cc56a953c66d5b900572e877 Mon Sep 17 00:00:00 2001
From: Omar Salman <omar.salman2007@gmail.com>
Date: Fri, 6 Sep 2024 11:15:13 +0500
Subject: [PATCH 17/17] Update sklearn/utils/tests/test_estimator_checks.py

---
 sklearn/utils/tests/test_estimator_checks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index 7a851bb4e7a6c..984a6f3ff5b63 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -1312,7 +1312,7 @@ def __sklearn_clone__(self):
 
 
 def test_estimator_repr_error():
-    """Check that the right error is raised when the estimator does not do a repr."""
+    """Check that the right error is raised when the estimator does not have a repr."""
 
     class NotRepr(BaseEstimator):
         def __repr__(self):