Skip to content

FIX: partial_fit from SelectFromModel doesn't validate the parameters #23299

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
May 11, 2022
7 changes: 7 additions & 0 deletions doc/whats_new/v1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ Changelog
:pr:`10468` by :user:`Ruben <icfly2>` and :pr:`22993` by
:user:`Jovan Stojanovic <jovan-stojanovic>`.

:mod:`sklearn.feature_selection`
................................

- |Fix| The `partial_fit` method of :class:`feature_selection.SelectFromModel`
  now validates the `max_features` parameter and sets the `feature_names_in_`
  attribute. :pr:`23299` by :user:`Long Bao <lorentzbao>`.

Code and Documentation Contributors
-----------------------------------

Expand Down
63 changes: 38 additions & 25 deletions sklearn/feature_selection/_from_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,10 @@
from ._base import _get_feature_importances
from ..base import BaseEstimator, clone, MetaEstimatorMixin
from ..utils._tags import _safe_tags
from ..utils.validation import check_is_fitted
from ..utils.validation import check_is_fitted, check_scalar, _num_features

from ..exceptions import NotFittedError
from ..utils.metaestimators import available_if
from ..utils.validation import check_scalar


def _calculate_threshold(estimator, importances, threshold):
Expand Down Expand Up @@ -287,34 +286,17 @@ def _get_support_mask(self):
mask[scores < threshold] = False
return mask

def fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.

y : array-like of shape (n_samples,), default=None
The target values (integers that correspond to classes in
classification, real numbers in regression).

**fit_params : dict
Other estimator specific parameters.

Returns
-------
self : object
Fitted estimator.
"""
def _check_max_features(self, X):
if self.max_features is not None:
n_features = _num_features(X)

if isinstance(self.max_features, numbers.Integral):
check_scalar(
self.max_features,
"max_features",
numbers.Integral,
min_val=0,
max_val=len(X[0]),
max_val=n_features,
)
self.max_features_ = self.max_features
elif callable(self.max_features):
Expand All @@ -324,7 +306,7 @@ def fit(self, X, y=None, **fit_params):
"max_features(X)",
numbers.Integral,
min_val=0,
max_val=len(X[0]),
max_val=n_features,
)
self.max_features_ = max_features
else:
Expand All @@ -333,6 +315,28 @@ def fit(self, X, y=None, **fit_params):
f" 'X' as input. Got {self.max_features} instead."
)

def fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.

y : array-like of shape (n_samples,), default=None
The target values (integers that correspond to classes in
classification, real numbers in regression).

**fit_params : dict
Other estimator specific parameters.

Returns
-------
self : object
Fitted estimator.
"""
self._check_max_features(X)

if self.prefit:
try:
check_is_fitted(self.estimator)
Expand Down Expand Up @@ -385,6 +389,8 @@ def partial_fit(self, X, y=None, **fit_params):
self : object
Fitted estimator.
"""
self._check_max_features(X)

if self.prefit:
if not hasattr(self, "estimator_"):
try:
Expand All @@ -397,9 +403,16 @@ def partial_fit(self, X, y=None, **fit_params):
self.estimator_ = deepcopy(self.estimator)
return self

if not hasattr(self, "estimator_"):
first_call = not hasattr(self, "estimator_")
if first_call:
self.estimator_ = clone(self.estimator)
self.estimator_.partial_fit(X, y, **fit_params)

if hasattr(self.estimator_, "feature_names_in_"):
self.feature_names_in_ = self.estimator_.feature_names_in_
else:
self._check_feature_names(X, reset=first_call)

return self

@property
Expand Down
37 changes: 37 additions & 0 deletions sklearn/feature_selection/tests/test_from_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,3 +607,40 @@ def importance_getter(estimator):
warnings.simplefilter("error", UserWarning)

selector.transform(X.iloc[1:3])


@pytest.mark.parametrize(
    "error, err_msg, max_features",
    (
        [ValueError, "max_features == 10, must be <= 4", 10],
        [TypeError, "'max_features' must be either an int or a callable", "a"],
        [ValueError, r"max_features\(X\) == 5, must be <= 4", lambda x: x.shape[1] + 1],
    ),
)
def test_partial_fit_validate_max_features(error, err_msg, max_features):
    """Check that `partial_fit` from SelectFromModel rejects invalid `max_features`."""
    X, y = datasets.make_classification(n_samples=100, n_features=4, random_state=0)

    # Construction never validates; the error must surface at partial_fit time.
    selector = SelectFromModel(estimator=SGDClassifier(), max_features=max_features)
    with pytest.raises(error, match=err_msg):
        selector.partial_fit(X, y, classes=[0, 1])


@pytest.mark.parametrize("as_frame", [True, False])
def test_partial_fit_validate_feature_names(as_frame):
    """Check that `partial_fit` from SelectFromModel sets `feature_names_in_`
    only when the input carries feature names (a pandas DataFrame)."""
    pytest.importorskip("pandas")
    X, y = datasets.load_iris(as_frame=as_frame, return_X_y=True)

    selector = SelectFromModel(estimator=SGDClassifier(), max_features=4)
    # partial_fit returns self, so chaining vs. separate call is equivalent.
    selector.partial_fit(X, y, classes=[0, 1, 2])

    if not as_frame:
        assert not hasattr(selector, "feature_names_in_")
    else:
        assert_array_equal(selector.feature_names_in_, X.columns)
5 changes: 5 additions & 0 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from ..linear_model import LogisticRegression
from ..linear_model import RANSACRegressor
from ..linear_model import Ridge
from ..linear_model import SGDRegressor

from ..base import (
clone,
Expand All @@ -44,6 +45,7 @@
from ..metrics import accuracy_score, adjusted_rand_score, f1_score
from ..random_projection import BaseRandomProjection
from ..feature_selection import SelectKBest
from ..feature_selection import SelectFromModel
from ..pipeline import make_pipeline
from ..exceptions import DataConversionWarning
from ..exceptions import NotFittedError
Expand Down Expand Up @@ -389,6 +391,9 @@ def _construct_instance(Estimator):
estimator = Estimator(LinearRegression())
elif issubclass(Estimator, RegressorMixin):
estimator = Estimator(Ridge())
elif issubclass(Estimator, SelectFromModel):
# Increases coverage because SGDRegressor has partial_fit
estimator = Estimator(SGDRegressor(random_state=0))
else:
estimator = Estimator(LogisticRegression(C=1))
elif required_parameters in (["estimators"],):
Expand Down