
[MRG] run check_estimator on meta-estimators #9741


Closed · wants to merge 19 commits
12 changes: 12 additions & 0 deletions sklearn/pipeline.py
@@ -1037,6 +1037,10 @@ def _more_tags(self):
# tuples and `fit` is not called yet to validate the steps.
pass

# hack to make common cases work:
# we assume the pipeline can handle NaN if all the steps can
tags["allow_nan"] = all(s[1]._get_tags()["allow_nan"] for s in self.steps)

return tags

def get_feature_names_out(self, input_features=None):
@@ -1817,6 +1821,14 @@ def _update_transformer_list(self, transformers):
for name, old in self.transformer_list
]

def _more_tags(self):
# The FeatureUnion can handle NaNs if all the steps can.
return {
"allow_nan": all(
s[1]._get_tags()["allow_nan"] for s in self.transformer_list
)
}

@property
def n_features_in_(self):
"""Number of features seen during :term:`fit`."""
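Below the pipeline.py diff, a minimal self-contained sketch (illustration only, not part of this PR) of the tag propagation the two hunks above implement, using the private _get_tags API of this branch: Pipeline and FeatureUnion report allow_nan=True only if every step / transformer does. The NaNOkTransf and NaNNotOkTransf helpers are hypothetical.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, make_union


class NaNOkTransf(BaseEstimator, TransformerMixin):
    # hypothetical transformer that declares it can handle NaN
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

    def _more_tags(self):
        return {"allow_nan": True}


class NaNNotOkTransf(NaNOkTransf):
    # hypothetical transformer that declares it cannot handle NaN
    def _more_tags(self):
        return {"allow_nan": False}


# Pipeline: allow_nan is the AND over all steps.
assert make_pipeline(NaNOkTransf(), NaNOkTransf())._get_tags()["allow_nan"]
assert not make_pipeline(NaNOkTransf(), NaNNotOkTransf())._get_tags()["allow_nan"]

# FeatureUnion: the same rule over transformer_list.
assert not make_union(NaNOkTransf(), NaNNotOkTransf())._get_tags()["allow_nan"]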
39 changes: 37 additions & 2 deletions sklearn/tests/test_metaestimators.py
@@ -6,21 +6,24 @@
import pytest

from sklearn.base import BaseEstimator, is_regressor
from sklearn.cluster import KMeans
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import RFE, RFECV, SelectFromModel
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.utils import all_estimators
from sklearn.utils._testing import set_random_state
from sklearn.utils.estimator_checks import (
_enforce_estimator_tags_X,
_enforce_estimator_tags_y,
parametrize_with_checks,
)
from sklearn.utils.validation import check_is_fitted

@@ -74,6 +77,38 @@ def __init__(
),
]

TESTED_META = [
Member: Why is this better here than in test_pipeline.py, test_from_model.py, etc.?

Member Author: I'm not sure it is. Having them in one place has pros and cons, I think. Right now it shows what kinds of cases and what interactions we're testing.

Member: +1 for putting them all in the same place (for now?)

Member Author: I have no strong opinion either way; I was about to move them to the respective files, mostly into the pipeline tests.

# pipelines
Pipeline((("ss", StandardScaler()),)),
Pipeline([("ss", StandardScaler())]),
make_pipeline(StandardScaler(), LogisticRegression()),
# union
make_union(StandardScaler()),
# union and pipeline
make_pipeline(make_union(PCA(), StandardScaler()), LogisticRegression()),
# pipeline with clustering
make_pipeline(KMeans(random_state=0)),
# SelectFromModel
make_pipeline(
SelectFromModel(LogisticRegression(), threshold=-np.inf), LogisticRegression()
),
# grid-search
GridSearchCV(LogisticRegression(), {"C": [0.1, 1]}, cv=2),
# will fail tragically
# make_pipeline(StandardScaler(), None)
Member: Is this failing because None or 'passthrough' does not support _get_tags?

Member Author: There were multiple issues.

]


@parametrize_with_checks(TESTED_META)
def test_metaestimators_check_estimator(estimator, check):
if check.func.__name__ in [
"check_estimators_overwrite_params",
"check_dont_overwrite_parameters",
] and (isinstance(estimator, Pipeline) or isinstance(estimator, FeatureUnion)):
# we don't clone in pipeline or feature union
return
check(estimator)


def test_metaestimator_delegation():
# Ensures specified metaestimators have methods iff subestimator does
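For context, the new test above follows the standard parametrize_with_checks pattern; a minimal self-contained sketch (not part of the diff, run under pytest):

from sklearn.linear_model import LogisticRegression
from sklearn.utils.estimator_checks import parametrize_with_checks


# parametrize_with_checks turns each (estimator, check) pair into its own
# pytest case; the test body simply runs the check against the estimator.
@parametrize_with_checks([LogisticRegression()])
def test_sklearn_compatible_estimator(estimator, check):
    check(estimator)

The skip condition in test_metaestimators_check_estimator reads check.func.__name__ because each generated check is a functools.partial wrapping the underlying check function.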
13 changes: 12 additions & 1 deletion sklearn/tests/test_pipeline.py
@@ -95,6 +95,12 @@ def inverse_transform(self, X):
return X


class FitTransf(NoTrans):
# has fit_transform but not transform
def fit_transform(self, X, y=None):
return X


class TransfFitParams(Transf):
def fit(self, X, y, **fit_params):
self.fit_params = fit_params
@@ -812,6 +818,7 @@ def test_pipeline_ducktyping():
pipeline.predict
pipeline.transform
pipeline.inverse_transform
pipeline.fit_transform

pipeline = make_pipeline(Transf())
assert not hasattr(pipeline, "predict")
@@ -823,6 +830,7 @@
assert not hasattr(pipeline, "predict")
pipeline.transform
pipeline.inverse_transform
pipeline.fit_transform

pipeline = make_pipeline(Transf(), NoInvTransf())
assert not hasattr(pipeline, "predict")
@@ -834,6 +842,10 @@
pipeline.transform
assert not hasattr(pipeline, "inverse_transform")

pipeline = make_pipeline(FitTransf())
assert not hasattr(pipeline, "transform")
pipeline.fit_transform


def test_make_pipeline():
t1 = Transf()
@@ -1260,7 +1272,6 @@ def test_step_name_validation():
est.set_params(**{param: bad_steps})
with pytest.raises(ValueError, match=message):
est.fit([[1]], [1])

with pytest.raises(ValueError, match=message):
est.fit_transform([[1]], [1])

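Finally, a minimal self-contained sketch (illustration only) of the duck-typing behaviour the new FitTransf case asserts: a pipeline whose final step implements fit_transform but not transform exposes fit_transform while hasattr(pipeline, "transform") is False. FitTransformOnly is a hypothetical stand-in for the FitTransf helper above.

from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline


class FitTransformOnly(BaseEstimator):
    # hypothetical final step: has fit_transform but no transform
    def fit(self, X, y=None):
        return self

    def fit_transform(self, X, y=None):
        return X


pipe = make_pipeline(FitTransformOnly())
assert not hasattr(pipe, "transform")  # the final step lacks transform
pipe.fit_transform  # but fit_transform is still exposed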