7 changes: 7 additions & 0 deletions doc/whats_new/v1.2.rst
@@ -58,6 +58,13 @@ Changelog
:pr:`10468` by :user:`Ruben <icfly2>` and :pr:`22993` by
:user:`Jovan Stojanovic <jovan-stojanovic>`.

:mod:`sklearn.feature_selection`
................................

- |Fix| The `partial_fit` method of :class:`feature_selection.SelectFromModel`
  now validates the `max_features` parameter and the `feature_names_in_`
  attribute. :pr:`23299` by :user:`Long Bao <lorentzbao>`.
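
  A minimal sketch of the fixed behavior (the dataset shape and the choice of
  `SGDClassifier` are illustrative, mirroring the tests added in this PR):
  calling `partial_fit` with an out-of-range `max_features` now raises the
  same error `fit` does::

      import numpy as np
      from sklearn.datasets import make_classification
      from sklearn.feature_selection import SelectFromModel
      from sklearn.linear_model import SGDClassifier

      X, y = make_classification(n_samples=50, n_features=5, random_state=0)
      selector = SelectFromModel(SGDClassifier(random_state=0), max_features=10)
      try:
          selector.partial_fit(X, y, classes=np.unique(y))
      except ValueError as exc:
          print(exc)  # max_features == 10, must be <= 5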

Code and Documentation Contributors
-----------------------------------

63 changes: 38 additions & 25 deletions sklearn/feature_selection/_from_model.py
@@ -10,11 +10,10 @@
from ._base import _get_feature_importances
from ..base import BaseEstimator, clone, MetaEstimatorMixin
from ..utils._tags import _safe_tags
from ..utils.validation import check_is_fitted
from ..utils.validation import check_is_fitted, check_scalar, _num_features

from ..exceptions import NotFittedError
from ..utils.metaestimators import available_if
from ..utils.validation import check_scalar


def _calculate_threshold(estimator, importances, threshold):
@@ -287,34 +286,17 @@ def _get_support_mask(self):
mask[scores < threshold] = False
return mask

def fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.

y : array-like of shape (n_samples,), default=None
The target values (integers that correspond to classes in
classification, real numbers in regression).

**fit_params : dict
            Other estimator-specific parameters.

Returns
-------
self : object
Fitted estimator.
"""
def _check_max_features(self, X):
if self.max_features is not None:
n_features = _num_features(X)

if isinstance(self.max_features, numbers.Integral):
check_scalar(
self.max_features,
"max_features",
numbers.Integral,
min_val=0,
max_val=len(X[0]),
max_val=n_features,
)
self.max_features_ = self.max_features
elif callable(self.max_features):
@@ -324,7 +306,7 @@ def fit(self, X, y=None, **fit_params):
"max_features(X)",
numbers.Integral,
min_val=0,
max_val=len(X[0]),
max_val=n_features,
)
self.max_features_ = max_features
else:
@@ -333,6 +315,28 @@ def fit(self, X, y=None, **fit_params):
f" 'X' as input. Got {self.max_features} instead."
)

def fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.

y : array-like of shape (n_samples,), default=None
The target values (integers that correspond to classes in
classification, real numbers in regression).

**fit_params : dict
            Other estimator-specific parameters.

Returns
-------
self : object
Fitted estimator.
"""
self._check_max_features(X)

if self.prefit:
try:
check_is_fitted(self.estimator)
@@ -385,6 +389,8 @@ def partial_fit(self, X, y=None, **fit_params):
self : object
Fitted estimator.
"""
self._check_max_features(X)

if self.prefit:
if not hasattr(self, "estimator_"):
try:
@@ -397,9 +403,16 @@ def partial_fit(self, X, y=None, **fit_params):
self.estimator_ = deepcopy(self.estimator)
return self

if not hasattr(self, "estimator_"):
first_call = not hasattr(self, "estimator_")
if first_call:
self.estimator_ = clone(self.estimator)
self.estimator_.partial_fit(X, y, **fit_params)

if hasattr(self.estimator_, "feature_names_in_"):
self.feature_names_in_ = self.estimator_.feature_names_in_
else:
self._check_feature_names(X, reset=first_call)

return self

@property
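
Taken together, the `_from_model.py` changes above mean `partial_fit` now runs
the same `max_features` validation as `fit` (including the callable form,
checked against `_num_features(X)` on every call) and exposes
`feature_names_in_` by mirroring the fitted sub-estimator. A usage sketch of
these new code paths (the halving callable and the column names are
illustrative):

    import pandas as pd
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import SGDClassifier

    X, y = load_iris(return_X_y=True)

    # A callable max_features is validated as an int in [0, n_features]
    # on each call to partial_fit, then stored as max_features_.
    selector = SelectFromModel(
        SGDClassifier(random_state=0), max_features=lambda X: X.shape[1] // 2
    )
    selector.partial_fit(X, y, classes=[0, 1, 2])
    print(selector.max_features_)  # 2

    # With a DataFrame, feature_names_in_ is copied from the sub-estimator.
    X_df = pd.DataFrame(X, columns=["sl", "sw", "pl", "pw"])
    selector = SelectFromModel(SGDClassifier(random_state=0), max_features=2)
    selector.partial_fit(X_df, y, classes=[0, 1, 2])
    print(selector.feature_names_in_)  # ['sl' 'sw' 'pl' 'pw']
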
37 changes: 37 additions & 0 deletions sklearn/feature_selection/tests/test_from_model.py
@@ -607,3 +607,40 @@ def importance_getter(estimator):
warnings.simplefilter("error", UserWarning)

selector.transform(X.iloc[1:3])


@pytest.mark.parametrize(
"error, err_msg, max_features",
(
[ValueError, "max_features == 10, must be <= 4", 10],
[TypeError, "'max_features' must be either an int or a callable", "a"],
[ValueError, r"max_features\(X\) == 5, must be <= 4", lambda x: x.shape[1] + 1],
),
)
def test_partial_fit_validate_max_features(error, err_msg, max_features):
"""Test that partial_fit from SelectFromModel validates `max_features`."""
X, y = datasets.make_classification(
n_samples=100,
n_features=4,
random_state=0,
)

with pytest.raises(error, match=err_msg):
SelectFromModel(
estimator=SGDClassifier(), max_features=max_features
).partial_fit(X, y, classes=[0, 1])


@pytest.mark.parametrize("as_frame", [True, False])
def test_partial_fit_validate_feature_names(as_frame):
"""Test that partial_fit from SelectFromModel validates `feature_names_in_`."""
pytest.importorskip("pandas")
X, y = datasets.load_iris(as_frame=as_frame, return_X_y=True)

selector = SelectFromModel(estimator=SGDClassifier(), max_features=4).partial_fit(
X, y, classes=[0, 1, 2]
)
if as_frame:
assert_array_equal(selector.feature_names_in_, X.columns)
else:
assert not hasattr(selector, "feature_names_in_")
5 changes: 5 additions & 0 deletions sklearn/utils/estimator_checks.py
@@ -31,6 +31,7 @@
from ..linear_model import LogisticRegression
from ..linear_model import RANSACRegressor
from ..linear_model import Ridge
from ..linear_model import SGDRegressor

from ..base import (
clone,
@@ -44,6 +45,7 @@
from ..metrics import accuracy_score, adjusted_rand_score, f1_score
from ..random_projection import BaseRandomProjection
from ..feature_selection import SelectKBest
from ..feature_selection import SelectFromModel
from ..pipeline import make_pipeline
from ..exceptions import DataConversionWarning
from ..exceptions import NotFittedError
@@ -389,6 +391,9 @@ def _construct_instance(Estimator):
estimator = Estimator(LinearRegression())
elif issubclass(Estimator, RegressorMixin):
estimator = Estimator(Ridge())
elif issubclass(Estimator, SelectFromModel):
# Increases coverage because SGDRegressor has partial_fit
estimator = Estimator(SGDRegressor(random_state=0))
else:
estimator = Estimator(LogisticRegression(C=1))
elif required_parameters in (["estimators"],):
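
For reference, `_construct_instance` is private; the public entry point that
reaches this new branch is `check_estimator`. A sketch using the same instance
the diff constructs:

    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import SGDRegressor
    from sklearn.utils.estimator_checks import check_estimator

    # Runs the common-test battery on SelectFromModel wrapping an estimator
    # that implements partial_fit, exercising the newly validated paths.
    check_estimator(SelectFromModel(estimator=SGDRegressor(random_state=0)))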