diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
index 7d4584ec04300..30dde6dd121d1 100644
--- a/doc/whats_new/v1.2.rst
+++ b/doc/whats_new/v1.2.rst
@@ -58,6 +58,13 @@ Changelog
   :pr:`10468` by :user:`Ruben ` and
   :pr:`22993` by :user:`Jovan Stojanovic `.
 
+:mod:`sklearn.feature_selection`
+................................
+
+- |Fix| The `partial_fit` method of :class:`feature_selection.SelectFromModel`
+  now validates the `max_features` parameter and the feature names of `X`
+  (`feature_names_in_`). :pr:`23299` by :user:`Long Bao `.
+
 Code and Documentation Contributors
 -----------------------------------
 
diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py
index 4868c8a7e86eb..0c41c66fbef1f 100644
--- a/sklearn/feature_selection/_from_model.py
+++ b/sklearn/feature_selection/_from_model.py
@@ -10,11 +10,10 @@
 from ._base import _get_feature_importances
 from ..base import BaseEstimator, clone, MetaEstimatorMixin
 from ..utils._tags import _safe_tags
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, check_scalar, _num_features
 
 from ..exceptions import NotFittedError
 from ..utils.metaestimators import available_if
-from ..utils.validation import check_scalar
 
 
 def _calculate_threshold(estimator, importances, threshold):
@@ -287,34 +286,17 @@ def _get_support_mask(self):
         mask[scores < threshold] = False
         return mask
 
-    def fit(self, X, y=None, **fit_params):
-        """Fit the SelectFromModel meta-transformer.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            The training input samples.
-
-        y : array-like of shape (n_samples,), default=None
-            The target values (integers that correspond to classes in
-            classification, real numbers in regression).
-
-        **fit_params : dict
-            Other estimator specific parameters.
-
-        Returns
-        -------
-        self : object
-            Fitted estimator.
-        """
+    def _check_max_features(self, X):
         if self.max_features is not None:
+            n_features = _num_features(X)
+
             if isinstance(self.max_features, numbers.Integral):
                 check_scalar(
                     self.max_features,
                     "max_features",
                     numbers.Integral,
                     min_val=0,
-                    max_val=len(X[0]),
+                    max_val=n_features,
                 )
                 self.max_features_ = self.max_features
             elif callable(self.max_features):
@@ -324,7 +306,7 @@ def fit(self, X, y=None, **fit_params):
                     "max_features(X)",
                     numbers.Integral,
                     min_val=0,
-                    max_val=len(X[0]),
+                    max_val=n_features,
                 )
                 self.max_features_ = max_features
             else:
@@ -333,6 +315,28 @@ def fit(self, X, y=None, **fit_params):
                     f" 'X' as input. Got {self.max_features} instead."
                 )
 
+    def fit(self, X, y=None, **fit_params):
+        """Fit the SelectFromModel meta-transformer.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like of shape (n_samples,), default=None
+            The target values (integers that correspond to classes in
+            classification, real numbers in regression).
+
+        **fit_params : dict
+            Other estimator specific parameters.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        self._check_max_features(X)
+
         if self.prefit:
             try:
                 check_is_fitted(self.estimator)
@@ -385,6 +389,8 @@ def partial_fit(self, X, y=None, **fit_params):
         self : object
             Fitted estimator.
         """
+        self._check_max_features(X)
+
         if self.prefit:
             if not hasattr(self, "estimator_"):
                 try:
@@ -397,9 +403,16 @@ def partial_fit(self, X, y=None, **fit_params):
                 self.estimator_ = deepcopy(self.estimator)
             return self
 
-        if not hasattr(self, "estimator_"):
+        first_call = not hasattr(self, "estimator_")
+        if first_call:
             self.estimator_ = clone(self.estimator)
         self.estimator_.partial_fit(X, y, **fit_params)
+
+        if hasattr(self.estimator_, "feature_names_in_"):
+            self.feature_names_in_ = self.estimator_.feature_names_in_
+        else:
+            self._check_feature_names(X, reset=first_call)
+
         return self
 
     @property
diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py
index a9681635b8ddb..de45d9e0ab6a4 100644
--- a/sklearn/feature_selection/tests/test_from_model.py
+++ b/sklearn/feature_selection/tests/test_from_model.py
@@ -607,3 +607,40 @@ def importance_getter(estimator):
         warnings.simplefilter("error", UserWarning)
 
         selector.transform(X.iloc[1:3])
+
+
+@pytest.mark.parametrize(
+    "error, err_msg, max_features",
+    (
+        [ValueError, "max_features == 10, must be <= 4", 10],
+        [TypeError, "'max_features' must be either an int or a callable", "a"],
+        [ValueError, r"max_features\(X\) == 5, must be <= 4", lambda x: x.shape[1] + 1],
+    ),
+)
+def test_partial_fit_validate_max_features(error, err_msg, max_features):
+    """Test that partial_fit from SelectFromModel validates `max_features`."""
+    X, y = datasets.make_classification(
+        n_samples=100,
+        n_features=4,
+        random_state=0,
+    )
+
+    with pytest.raises(error, match=err_msg):
+        SelectFromModel(
+            estimator=SGDClassifier(), max_features=max_features
+        ).partial_fit(X, y, classes=[0, 1])
+
+
+@pytest.mark.parametrize("as_frame", [True, False])
+def test_partial_fit_validate_feature_names(as_frame):
+    """Test that partial_fit from SelectFromModel validates `feature_names_in_`."""
+    pytest.importorskip("pandas")
+    X, y = datasets.load_iris(as_frame=as_frame, return_X_y=True)
+
+    selector = SelectFromModel(estimator=SGDClassifier(), max_features=4).partial_fit(
+        X, y, classes=[0, 1, 2]
+    )
+    if as_frame:
+        assert_array_equal(selector.feature_names_in_, X.columns)
+    else:
+        assert not hasattr(selector, "feature_names_in_")
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 86494b2ae2307..33cd54a5cb4ac 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -31,6 +31,7 @@
 from ..linear_model import LogisticRegression
 from ..linear_model import RANSACRegressor
 from ..linear_model import Ridge
+from ..linear_model import SGDRegressor
 
 from ..base import (
     clone,
@@ -44,6 +45,7 @@
 from ..metrics import accuracy_score, adjusted_rand_score, f1_score
 from ..random_projection import BaseRandomProjection
 from ..feature_selection import SelectKBest
+from ..feature_selection import SelectFromModel
 from ..pipeline import make_pipeline
 from ..exceptions import DataConversionWarning
 from ..exceptions import NotFittedError
@@ -389,6 +391,9 @@ def _construct_instance(Estimator):
             estimator = Estimator(LinearRegression())
         elif issubclass(Estimator, RegressorMixin):
             estimator = Estimator(Ridge())
+        elif issubclass(Estimator, SelectFromModel):
+            # Increases coverage because SGDRegressor has partial_fit
+            estimator = Estimator(SGDRegressor(random_state=0))
        else:
             estimator = Estimator(LogisticRegression(C=1))
     elif required_parameters in (["estimators"],):