Skip to content

TST allow setting per test settings for estimators #29820

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 12, 2024
11 changes: 11 additions & 0 deletions sklearn/cluster/_bicluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,17 @@ def _fit(self, X):
[self.column_labels_ == c for c in range(self.n_clusters)]
)

def __sklearn_tags__(self):
    """Return estimator tags, marking common checks known to fail."""
    tags = super().__sklearn_tags__()
    # check_dict_unchanged currently fails for this estimator with:
    #   ValueError: Found array with 0 feature(s) (shape=(23, 0))
    #   while a minimum of 1 is required.
    tags._xfail_checks["check_dict_unchanged"] = "FIXME"
    return tags


class SpectralBiclustering(BaseSpectral):
"""Spectral biclustering (Kluger, 2003).
Expand Down
126 changes: 124 additions & 2 deletions sklearn/utils/_test_common/instance_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from functools import partial
from inspect import isfunction

from sklearn import config_context
from sklearn import clone, config_context
from sklearn.calibration import CalibratedClassifierCV
from sklearn.cluster import (
HDBSCAN,
Expand All @@ -33,6 +33,7 @@
FactorAnalysis,
FastICA,
IncrementalPCA,
KernelPCA,
LatentDirichletAllocation,
MiniBatchDictionaryLearning,
MiniBatchNMF,
Expand All @@ -41,6 +42,7 @@
SparsePCA,
TruncatedSVD,
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
Expand Down Expand Up @@ -72,6 +74,12 @@
SelectKBest,
SequentialFeatureSelector,
)
from sklearn.kernel_approximation import (
Nystroem,
PolynomialCountSketch,
RBFSampler,
SkewedChi2Sampler,
)
from sklearn.linear_model import (
ARDRegression,
BayesianRidge,
Expand Down Expand Up @@ -105,7 +113,13 @@
TheilSenRegressor,
TweedieRegressor,
)
from sklearn.manifold import MDS, TSNE, LocallyLinearEmbedding, SpectralEmbedding
from sklearn.manifold import (
MDS,
TSNE,
Isomap,
LocallyLinearEmbedding,
SpectralEmbedding,
)
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.model_selection import (
FixedThresholdClassifier,
Expand Down Expand Up @@ -457,6 +471,79 @@
),
}

# This dictionary stores parameters for specific checks. It also enables running the
# same check with multiple instances of the same estimator with different parameters.
# The special key "*" allows applying the parameters to all checks.
# TODO(devtools): allow third-party developers to pass test specific params to checks
# NOTE: the parameter values below (n_components=1, small max_iter, n_clusters=1,
# etc.) are chosen to keep the checks fast and the fitted models minimal, not to
# produce meaningful models.
PER_ESTIMATOR_CHECK_PARAMS: dict = {
    # TODO(devtools): check that function names here exist in checks for the estimator
    # TODO(devtools): write a test for the same thing with tags._xfail_checks
    AgglomerativeClustering: {"check_dict_unchanged": dict(n_clusters=1)},
    BayesianGaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)},
    BernoulliRBM: {"check_dict_unchanged": dict(n_components=1, n_iter=5)},
    Birch: {"check_dict_unchanged": dict(n_clusters=1)},
    BisectingKMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)},
    CCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    DictionaryLearning: {
        "check_dict_unchanged": dict(
            max_iter=20, n_components=1, transform_algorithm="lasso_lars"
        )
    },
    FactorAnalysis: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    FastICA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    FeatureAgglomeration: {"check_dict_unchanged": dict(n_clusters=1)},
    GaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)},
    GaussianRandomProjection: {"check_dict_unchanged": dict(n_components=1)},
    IncrementalPCA: {"check_dict_unchanged": dict(batch_size=10, n_components=1)},
    Isomap: {"check_dict_unchanged": dict(n_components=1)},
    KMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)},
    KernelPCA: {"check_dict_unchanged": dict(n_components=1)},
    LatentDirichletAllocation: {
        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
    },
    LinearDiscriminantAnalysis: {"check_dict_unchanged": dict(n_components=1)},
    LocallyLinearEmbedding: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    MDS: {"check_dict_unchanged": dict(max_iter=5, n_components=1, n_init=2)},
    MiniBatchDictionaryLearning: {
        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
    },
    MiniBatchKMeans: {
        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_clusters=1, n_init=2)
    },
    MiniBatchNMF: {
        "check_dict_unchanged": dict(
            batch_size=10, fresh_restarts=True, max_iter=20, n_components=1
        )
    },
    MiniBatchSparsePCA: {
        "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
    },
    NMF: {"check_dict_unchanged": dict(max_iter=500, n_components=1)},
    NeighborhoodComponentsAnalysis: {
        "check_dict_unchanged": dict(max_iter=5, n_components=1)
    },
    Nystroem: {"check_dict_unchanged": dict(n_components=1)},
    PCA: {"check_dict_unchanged": dict(n_components=1)},
    PLSCanonical: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    PLSRegression: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    PLSSVD: {"check_dict_unchanged": dict(n_components=1)},
    PolynomialCountSketch: {"check_dict_unchanged": dict(n_components=1)},
    RBFSampler: {"check_dict_unchanged": dict(n_components=1)},
    SkewedChi2Sampler: {"check_dict_unchanged": dict(n_components=1)},
    SparsePCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
    SparseRandomProjection: {"check_dict_unchanged": dict(n_components=1)},
    SpectralBiclustering: {
        "check_dict_unchanged": dict(n_best=1, n_clusters=1, n_components=1, n_init=2)
    },
    SpectralClustering: {
        "check_dict_unchanged": dict(n_clusters=1, n_components=1, n_init=2)
    },
    SpectralCoclustering: {"check_dict_unchanged": dict(n_clusters=1, n_init=2)},
    SpectralEmbedding: {"check_dict_unchanged": dict(eigen_tol=1e-05, n_components=1)},
    TSNE: {"check_dict_unchanged": dict(n_components=1, perplexity=2)},
    TruncatedSVD: {"check_dict_unchanged": dict(n_components=1)},
}


def _tested_estimators(type_filter=None):
for name, Estimator in all_estimators(type_filter=type_filter):
Expand Down Expand Up @@ -527,3 +614,38 @@ def _get_check_estimator_ids(obj):
if hasattr(obj, "get_params"):
with config_context(print_changed_only=True):
return re.sub(r"\s", "", str(obj))


def _yield_instances_for_check(check, estimator_orig):
    """Yield estimator instances on which to run ``check``.

    For most estimators, this is a no-op and yields ``estimator_orig``
    unchanged.

    For estimators which have an entry in ``PER_ESTIMATOR_CHECK_PARAMS``, this
    yields one freshly cloned estimator per parameter set registered for the
    check. The special key ``"*"`` registers parameters applied to every check
    that has no check-specific entry (this fixes the function ignoring the
    ``"*"`` key documented on ``PER_ESTIMATOR_CHECK_PARAMS``).

    Parameters
    ----------
    check : callable
        The check to be run; either a plain function or a
        ``functools.partial`` wrapping one.
    estimator_orig : estimator instance
        The estimator instance provided by the test framework. Never mutated;
        configured copies are produced with ``clone``.

    Yields
    ------
    estimator : estimator instance
        ``estimator_orig`` itself, or clones configured with each registered
        parameter set.
    """
    # TODO(devtools): enable this behavior for third party estimators as well
    check_params = PER_ESTIMATOR_CHECK_PARAMS.get(type(estimator_orig))
    if check_params is None:
        yield estimator_orig
        return

    try:
        check_name = check.__name__
    except AttributeError:
        # partial tests: functools.partial objects carry the name on .func
        check_name = check.func.__name__

    # Fall back to the special "*" key, which applies to all checks lacking a
    # check-specific entry.
    param_set = check_params.get(check_name, check_params.get("*"))
    if param_set is None:
        yield estimator_orig
        return

    # A bare dict registers a single parameter set; a list registers several.
    if isinstance(param_set, dict):
        param_set = [param_set]

    for params in param_set:
        estimator = clone(estimator_orig)
        estimator.set_params(**params)
        yield estimator
37 changes: 13 additions & 24 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
from ._test_common.instance_generator import (
CROSS_DECOMPOSITION,
_get_check_estimator_ids,
_yield_instances_for_check,
)
from ._testing import (
SkipTest,
Expand All @@ -81,7 +82,6 @@


def _yield_api_checks(estimator):
yield check_estimator_cloneable
yield check_estimator_repr
yield check_no_attributes_set_in_init
yield check_fit_score_takes_y
Expand Down Expand Up @@ -509,10 +509,14 @@ def parametrize_with_checks(estimators, *, legacy=True):

def checks_generator():
for estimator in estimators:
# First check that the estimator is cloneable which is needed for the rest
# of the checks to run
name = type(estimator).__name__
yield estimator, partial(check_estimator_cloneable, name)
for check in _yield_all_checks(estimator, legacy=legacy):
check = partial(check, name)
yield _maybe_mark_xfail(estimator, check, pytest)
check_with_name = partial(check, name)
for check_instance in _yield_instances_for_check(check, estimator):
yield _maybe_mark_xfail(check_instance, check_with_name, pytest)

return pytest.mark.parametrize(
"estimator, check", checks_generator(), ids=_get_check_estimator_ids
Expand Down Expand Up @@ -597,9 +601,13 @@ def check_estimator(estimator=None, generate_only=False, *, legacy=True):
name = type(estimator).__name__

def checks_generator():
# we first need to check if the estimator is cloneable for the rest of the tests
# to run
yield estimator, partial(check_estimator_cloneable, name)
for check in _yield_all_checks(estimator, legacy=legacy):
check = _maybe_skip(estimator, check)
yield estimator, partial(check, name)
for check_instance in _yield_instances_for_check(check, estimator):
yield check_instance, partial(check, name)

if generate_only:
return checks_generator()
Expand Down Expand Up @@ -1257,32 +1265,13 @@ def check_complex_data(name, estimator_orig):

@ignore_warnings
def check_dict_unchanged(name, estimator_orig):
# this estimator raises
# ValueError: Found array with 0 feature(s) (shape=(23, 0))
# while a minimum of 1 is required.
# error
if name in ["SpectralCoclustering"]:
return
rnd = np.random.RandomState(0)
if name in ["RANSACRegressor"]:
X = 3 * rnd.uniform(size=(20, 3))
else:
X = 2 * rnd.uniform(size=(20, 3))

X = 3 * rnd.uniform(size=(20, 3))
X = _enforce_estimator_tags_X(estimator_orig, X)

y = X[:, 0].astype(int)
estimator = clone(estimator_orig)
y = _enforce_estimator_tags_y(estimator, y)
if hasattr(estimator, "n_components"):
estimator.n_components = 1

if hasattr(estimator, "n_clusters"):
estimator.n_clusters = 1

if hasattr(estimator, "n_best"):
estimator.n_best = 1

set_random_state(estimator, 1)

estimator.fit(X, y)
Expand Down