Skip to content

MAINT validate parameters of Pipeline #25133

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 26 additions & 5 deletions sklearn/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from .utils.validation import check_memory
from .utils.validation import check_is_fitted
from .utils import check_pandas_support
from .utils._param_validation import HasMethods, Hidden
from .utils._set_output import _safe_set_output, _get_output_config
from .utils.fixes import delayed
from .exceptions import NotFittedError
Expand All @@ -40,7 +41,7 @@
def _final_estimator_has(attr):
"""Check that final_estimator has `attr`.

Used together with `avaliable_if` in `Pipeline`."""
Used together with `available_if` in `Pipeline`."""

def check(self):
# raise original `AttributeError` if `attr` does not exist
Expand Down Expand Up @@ -143,6 +144,12 @@ class Pipeline(_BaseComposition):
# BaseEstimator interface
_required_parameters = ["steps"]

_parameter_constraints: dict = {
"steps": [list, Hidden(tuple)],
"memory": [None, str, HasMethods(["cache"])],
"verbose": ["boolean"],
}

def __init__(self, steps, *, memory=None, verbose=False):
self.steps = steps
self.memory = memory
Expand Down Expand Up @@ -307,8 +314,15 @@ def named_steps(self):

@property
def _final_estimator(self):
estimator = self.steps[-1][1]
return "passthrough" if estimator is None else estimator
    try:
        estimator = self.steps[-1][1]
        return "passthrough" if estimator is None else estimator
    except (ValueError, AttributeError, TypeError):
        # This condition happens when a call to a method is first calling
        # `_available_if` and `fit` did not validate `steps` yet. We
        # return `None` and an `InvalidParameterError` will be raised
        # right after.
        return None

[Review comment — Member Author]: This is an additional change that we did not think of at first.

def _log_message(self, step_idx):
if not self.verbose:
Expand Down Expand Up @@ -398,6 +412,7 @@ def fit(self, X, y=None, **fit_params):
self : object
Pipeline with fitted steps.
"""
self._validate_params()
fit_params_steps = self._check_fit_params(**fit_params)
Xt = self._fit(X, y, **fit_params_steps)
with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
Expand Down Expand Up @@ -434,6 +449,7 @@ def fit_transform(self, X, y=None, **fit_params):
Xt : ndarray of shape (n_samples, n_transformed_features)
Transformed samples.
"""
self._validate_params()
fit_params_steps = self._check_fit_params(**fit_params)
Xt = self._fit(X, y, **fit_params_steps)

Expand Down Expand Up @@ -510,6 +526,7 @@ def fit_predict(self, X, y=None, **fit_params):
y_pred : ndarray
Result of calling `fit_predict` on the final estimator.
"""
self._validate_params()
fit_params_steps = self._check_fit_params(**fit_params)
Xt = self._fit(X, y, **fit_params_steps)

Expand Down Expand Up @@ -728,8 +745,12 @@ def classes_(self):
return self.steps[-1][1].classes_

def _more_tags(self):
# check if first estimator expects pairwise input
return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")}
try:
return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")}
except (ValueError, AttributeError, TypeError):
            # This happens when `steps` is not a list of (name, estimator)
# tuples and `fit` is not called yet to validate the steps.
return {}

def get_feature_names_out(self, input_features=None):
"""Get output feature names for transformation.
Expand Down
16 changes: 14 additions & 2 deletions sklearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.utils import IS_PYPY
from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags
Expand Down Expand Up @@ -273,6 +273,16 @@ def test_class_support_removed():
parametrize_with_checks([LogisticRegression])


def _generate_pipeline():
for final_estimator in [Ridge(), LogisticRegression()]:
yield Pipeline(
steps=[
("scaler", StandardScaler()),
("final_estimator", final_estimator),
]
)


def _generate_search_cv_instances():
for SearchCV, (Estimator, param_grid) in product(
[
Expand Down Expand Up @@ -458,7 +468,9 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator):


@pytest.mark.parametrize(
"estimator", _tested_estimators(), ids=_get_check_estimator_ids
"estimator",
chain(_tested_estimators(), _generate_pipeline()),
ids=_get_check_estimator_ids,
)
def test_check_param_validation(estimator):
name = estimator.__class__.__name__
Expand Down
40 changes: 0 additions & 40 deletions sklearn/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -1185,46 +1185,6 @@ def test_set_params_nested_pipeline():
estimator.set_params(a__steps=[("b", LogisticRegression())], a__b__C=5)


def test_pipeline_wrong_memory():
# Test that an error is raised when memory is not a string or a Memory
# instance
X = iris.data
y = iris.target
# Define memory as an integer
memory = 1
cached_pipe = Pipeline([("transf", DummyTransf()), ("svc", SVC())], memory=memory)

msg = re.escape(
"'memory' should be None, a string or have the same interface "
"as joblib.Memory. Got memory='1' instead."
)
with pytest.raises(ValueError, match=msg):
cached_pipe.fit(X, y)


class DummyMemory:
def cache(self, func):
return func


class WrongDummyMemory:
pass


def test_pipeline_with_cache_attribute():
X = np.array([[1, 2]])
pipe = Pipeline([("transf", Transf()), ("clf", Mult())], memory=DummyMemory())
pipe.fit(X, y=None)
dummy = WrongDummyMemory()
pipe = Pipeline([("transf", Transf()), ("clf", Mult())], memory=dummy)
msg = re.escape(
"'memory' should be None, a string or have the same interface "
f"as joblib.Memory. Got memory='{dummy}' instead."
)
with pytest.raises(ValueError, match=msg):
pipe.fit(X)


def test_pipeline_memory():
X = iris.data
y = iris.target
Expand Down