diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 83c62be61b21a..23c8e82c27fca 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -1037,6 +1037,19 @@ def _more_tags(self):
             # tuples and `fit` is not called yet to validate the steps.
             pass
 
+        try:
+            # Assume the pipeline can handle NaN if every actual step can.
+            # `None` and "passthrough" placeholders skip the step, so they
+            # do not restrict NaN support.
+            tags["allow_nan"] = all(
+                est._get_tags()["allow_nan"]
+                for _, est in self.steps
+                if est is not None and est != "passthrough"
+            )
+        except (ValueError, AttributeError, TypeError):
+            # As above: `steps` has not been validated by `fit` yet.
+            pass
+
         return tags
 
     def get_feature_names_out(self, input_features=None):
@@ -1817,6 +1830,22 @@ def _update_transformer_list(self, transformers):
             for name, old in self.transformer_list
         ]
 
+    def _more_tags(self):
+        # The FeatureUnion can handle NaNs if all of its transformers can.
+        # "drop" contributes no output and "passthrough" forwards X as is,
+        # so neither placeholder restricts NaN support.
+        try:
+            allow_nan = all(
+                trans._get_tags()["allow_nan"]
+                for _, trans in self.transformer_list
+                if trans not in ("drop", "passthrough")
+            )
+        except (ValueError, AttributeError, TypeError):
+            # `transformer_list` is not validated before `fit`; fall back to
+            # the default tags rather than failing tag lookup.
+            return {}
+        return {"allow_nan": allow_nan}
+
     @property
     def n_features_in_(self):
         """Number of features seen during :term:`fit`."""
diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py
index b3c6820faefc2..d06d9c657a062 100644
--- a/sklearn/tests/test_metaestimators.py
+++ b/sklearn/tests/test_metaestimators.py
@@ -6,14 +6,16 @@
 import pytest
 
 from sklearn.base import BaseEstimator, is_regressor
+from sklearn.cluster import KMeans
 from sklearn.datasets import make_classification
+from sklearn.decomposition import PCA
 from sklearn.ensemble import BaggingClassifier
 from sklearn.exceptions import NotFittedError
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.feature_selection import RFE, RFECV
+from sklearn.feature_selection import RFE, RFECV, SelectFromModel
 from sklearn.linear_model import LogisticRegression, Ridge
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
-from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
 from sklearn.preprocessing import MaxAbsScaler, StandardScaler
 from sklearn.semi_supervised import SelfTrainingClassifier
 from sklearn.utils import all_estimators
@@ -21,6 +23,7 @@
 from sklearn.utils.estimator_checks import (
     _enforce_estimator_tags_X,
     _enforce_estimator_tags_y,
+    parametrize_with_checks,
 )
 from sklearn.utils.validation import check_is_fitted
 
@@ -74,6 +77,38 @@ def __init__(
     ),
 ]
 
+TESTED_META = [
+    # pipelines
+    Pipeline((("ss", StandardScaler()),)),
+    Pipeline([("ss", StandardScaler())]),
+    make_pipeline(StandardScaler(), LogisticRegression()),
+    # union
+    make_union(StandardScaler()),
+    # union and pipeline
+    make_pipeline(make_union(PCA(), StandardScaler()), LogisticRegression()),
+    # pipeline with clustering
+    make_pipeline(KMeans(random_state=0)),
+    # SelectFromModel
+    make_pipeline(
+        SelectFromModel(LogisticRegression(), threshold=-np.inf), LogisticRegression()
+    ),
+    # grid-search
+    GridSearchCV(LogisticRegression(), {"C": [0.1, 1]}, cv=2),
+    # will fail tragically
+    # make_pipeline(StandardScaler(), None)
+]
+
+
+@parametrize_with_checks(TESTED_META)
+def test_metaestimators_check_estimator(estimator, check):
+    if check.func.__name__ in [
+        "check_estimators_overwrite_params",
+        "check_dont_overwrite_parameters",
+    ] and isinstance(estimator, (Pipeline, FeatureUnion)):
+        # we don't clone in pipeline or feature union
+        return
+    check(estimator)
+
 
 def test_metaestimator_delegation():
     # Ensures specified metaestimators have methods iff subestimator does
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index f5ed64a094063..2176889b8fb0e 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -95,6 +95,12 @@ def inverse_transform(self, X):
         return X
 
 
+class FitTransf(NoTrans):
+    # has fit_transform but not transform
+    def fit_transform(self, X, y=None):
+        return X
+
+
 class TransfFitParams(Transf):
     def fit(self, X, y, **fit_params):
         self.fit_params = fit_params
@@ -812,6 +818,7 @@ def test_pipeline_ducktyping():
     pipeline.predict
     pipeline.transform
     pipeline.inverse_transform
+    pipeline.fit_transform
 
     pipeline = make_pipeline(Transf())
     assert not hasattr(pipeline, "predict")
@@ -823,6 +830,7 @@ def test_pipeline_ducktyping():
     assert not hasattr(pipeline, "predict")
     pipeline.transform
     pipeline.inverse_transform
+    pipeline.fit_transform
 
     pipeline = make_pipeline(Transf(), NoInvTransf())
     assert not hasattr(pipeline, "predict")
@@ -834,6 +842,10 @@ def test_pipeline_ducktyping():
     pipeline.transform
     assert not hasattr(pipeline, "inverse_transform")
 
+    pipeline = make_pipeline(FitTransf())
+    assert not hasattr(pipeline, "transform")
+    pipeline.fit_transform
+
 
 def test_make_pipeline():
     t1 = Transf()
@@ -1260,7 +1272,6 @@ def test_step_name_validation():
             est.set_params(**{param: bad_steps})
             with pytest.raises(ValueError, match=message):
                 est.fit([[1]], [1])
-
             with pytest.raises(ValueError, match=message):
                 est.fit_transform([[1]], [1])
 