From c15e6e2dfda8dfd9803544a86ea701313cac8ff6 Mon Sep 17 00:00:00 2001
From: Vasco Pereira
Date: Wed, 26 Mar 2025 12:46:27 +0000
Subject: [PATCH 1/3] Fix #30936: ElasticNet l1 ratio fails for float-only
 arrays

Added `_is_l1_ratio_one` function to handle l1_ratio as array, list, or
tuple. Uses `np.isclose` and `np.any` to check if at least one element
is approximately equal to 1.0.
---
 sklearn/feature_selection/_from_model.py |  9 +++-
 .../tests/test_from_model.py             | 41 ++++++++++++++++++-
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py
index d73b53eea647e..cd69b15a7c09d 100644
--- a/sklearn/feature_selection/_from_model.py
+++ b/sklearn/feature_selection/_from_model.py
@@ -26,6 +26,11 @@
 )
 from ._base import SelectorMixin, _get_feature_importances
 
+def _is_l1_ratio_one(value):
+    """Helper to safely check if l1_ratio is (or includes) 1.0"""
+    if isinstance(value, (list, tuple, np.ndarray)):
+        return np.any(np.isclose(value, 1.0))
+    return np.isclose(value, 1.0)
 
 def _calculate_threshold(estimator, importances, threshold):
     """Interpret the threshold value"""
@@ -36,8 +41,8 @@ def _calculate_threshold(estimator, importances, threshold):
     is_l1_penalized = hasattr(estimator, "penalty") and estimator.penalty == "l1"
     is_lasso = "Lasso" in est_name
     is_elasticnet_l1_penalized = "ElasticNet" in est_name and (
-        (hasattr(estimator, "l1_ratio_") and np.isclose(estimator.l1_ratio_, 1.0))
-        or (hasattr(estimator, "l1_ratio") and np.isclose(estimator.l1_ratio, 1.0))
+        (hasattr(estimator, "l1_ratio_") and _is_l1_ratio_one(estimator.l1_ratio_)) or
+        (hasattr(estimator, "l1_ratio") and _is_l1_ratio_one(estimator.l1_ratio))
     )
     if is_l1_penalized or is_lasso or is_elasticnet_l1_penalized:
         # the natural default threshold is 0 when l1 penalty was used
diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py
index 421f575c92a0e..ee0c5ed733353 100644
--- a/sklearn/feature_selection/tests/test_from_model.py
+++ b/sklearn/feature_selection/tests/test_from_model.py
@@ -8,7 +8,7 @@
 from sklearn import datasets
 from sklearn.base import BaseEstimator
 from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression
-from sklearn.datasets import make_friedman1
+from sklearn.datasets import make_friedman1, make_regression
 from sklearn.decomposition import PCA
 from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
 from sklearn.exceptions import NotFittedError
@@ -687,3 +687,42 @@ def test_from_model_estimator_attribute_error():
         from_model.fit(data, y).partial_fit(data)
     assert isinstance(exec_info.value.__cause__, AttributeError)
     assert inner_msg in str(exec_info.value.__cause__)
+
+def test_is_l1_ratio_one():
+    """Test _is_l1_ratio_one helper function"""
+    from sklearn.feature_selection._from_model import _is_l1_ratio_one
+    # Test single values
+    assert _is_l1_ratio_one(1.0) == True
+    assert _is_l1_ratio_one(0.9) == False
+    assert _is_l1_ratio_one(0.0) == False
+    # Test lists
+    assert _is_l1_ratio_one([0.5, 1.0, 0.8]) == True
+    assert _is_l1_ratio_one([0.5, 0.8]) == False
+    # Test numpy arrays
+    assert _is_l1_ratio_one(np.array([0.5, 1.0])) == True
+    assert _is_l1_ratio_one(np.array([0.5, 0.8])) == False
+    # Test empty sequences
+    assert _is_l1_ratio_one([]) == False
+    assert _is_l1_ratio_one(np.array([])) == False
+
+def test_invalid_l1_ratio():
+    """Test that invalid l1_ratio values are handled correctly"""
+    # Values outside [0,1] should work but return False
+    clf = ElasticNetCV(l1_ratio=[0.5, 1.1])
+    model = SelectFromModel(estimator=clf)
+    X, y = make_regression(n_samples=100, n_features=10)
+    model.fit(X, y)
+    # Model should still work, just ignoring invalid l1_ratio
+    assert model.transform(X).shape[1] < X.shape[1]
+
+def test_l1_ratio_edge_cases():
+    """Test edge cases for l1_ratio detection"""
+    from sklearn.feature_selection._from_model import _is_l1_ratio_one
+    # Test values very close to 1
+    assert _is_l1_ratio_one(1.0 + 1e-10) == True
+    assert _is_l1_ratio_one([0.5, 1.0 + 1e-10]) == True
+    # Test mixed valid/invalid values
+    assert _is_l1_ratio_one([0.5, 1.1, 1.0]) == True
+    # Test non-numeric values
+    with pytest.raises(TypeError):
+        _is_l1_ratio_one(['a', 'b'])

From 593224f9c236a0b3af5de6b2c182f279f3970265 Mon Sep 17 00:00:00 2001
From: Vasco Pereira
Date: Mon, 21 Apr 2025 18:49:47 +0100
Subject: [PATCH 2/3] =?UTF-8?q?Fix=20scikit-learn#30936:=20Add=20=20`is=5F?=
 =?UTF-8?q?elasticnetcv=5Fl1=5Fpenalized`=20branch=20so=20that=20ElasticNe?=
 =?UTF-8?q?tCV=20=20works=20with=20a=20list=20of=20floats;=20Add=20non?=
 =?UTF-8?q?=E2=80=91regression=20test=20`test=5Fget=5Ffeature=5Fnames=5Fou?=
 =?UTF-8?q?t=5Felasticnetcv`?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sklearn/feature_selection/_from_model.py | 20 +++----
 .../tests/test_from_model.py             | 54 ++++++-------------
 2 files changed, 26 insertions(+), 48 deletions(-)

diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py
index cd69b15a7c09d..92654821c9dff 100644
--- a/sklearn/feature_selection/_from_model.py
+++ b/sklearn/feature_selection/_from_model.py
@@ -26,11 +26,6 @@
 )
 from ._base import SelectorMixin, _get_feature_importances
 
-def _is_l1_ratio_one(value):
-    """Helper to safely check if l1_ratio is (or includes) 1.0"""
-    if isinstance(value, (list, tuple, np.ndarray)):
-        return np.any(np.isclose(value, 1.0))
-    return np.isclose(value, 1.0)
 
 def _calculate_threshold(estimator, importances, threshold):
     """Interpret the threshold value"""
@@ -40,11 +35,18 @@ def _calculate_threshold(estimator, importances, threshold):
     est_name = estimator.__class__.__name__
     is_l1_penalized = hasattr(estimator, "penalty") and estimator.penalty == "l1"
     is_lasso = "Lasso" in est_name
-    is_elasticnet_l1_penalized = "ElasticNet" in est_name and (
-        (hasattr(estimator, "l1_ratio_") and _is_l1_ratio_one(estimator.l1_ratio_)) or
-        (hasattr(estimator, "l1_ratio") and _is_l1_ratio_one(estimator.l1_ratio))
+    is_elasticnet_l1_penalized = est_name == "ElasticNet" and (
+        hasattr(estimator, "l1_ratio") and np.isclose(estimator.l1_ratio, 1.0)
     )
-    if is_l1_penalized or is_lasso or is_elasticnet_l1_penalized:
+    is_elasticnetcv_l1_penalized = est_name == "ElasticNetCV" and (
+        hasattr(estimator, "l1_ratio_") and np.isclose(estimator.l1_ratio_, 1.0)
+    )
+    if (
+        is_l1_penalized
+        or is_lasso
+        or is_elasticnet_l1_penalized
+        or is_elasticnetcv_l1_penalized
+    ):
         # the natural default threshold is 0 when l1 penalty was used
         threshold = 1e-5
     else:
diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py
index ee0c5ed733353..53d33f6ebc898 100644
--- a/sklearn/feature_selection/tests/test_from_model.py
+++ b/sklearn/feature_selection/tests/test_from_model.py
@@ -489,6 +489,21 @@ def test_prefit_max_features():
         model.transform(data)
 
 
+def test_get_feature_names_out_elasticnetcv():
+    """Check if ElasticNetCV works with a list of floats.
+
+    Non-regression test for #30936."""
+    X, y = make_regression(n_samples=100, n_features=5, n_informative=3, random_state=0)
+    estimator = ElasticNetCV(l1_ratio=[0.25, 0.5, 0.75], random_state=0)
+    selector = SelectFromModel(estimator=estimator)
+    selector.fit(X, y)
+
+    names_out = selector.get_feature_names_out()
+    mask = selector.get_support()
+    expected = np.array([f"x{i}" for i in range(X.shape[1])])[mask]
+    assert_array_equal(names_out, expected)
+
+
 def test_prefit_get_feature_names_out():
     """Check the interaction between prefit and the feature names."""
     clf = RandomForestClassifier(n_estimators=2, random_state=0)
@@ -687,42 +702,3 @@ def test_from_model_estimator_attribute_error():
         from_model.fit(data, y).partial_fit(data)
     assert isinstance(exec_info.value.__cause__, AttributeError)
     assert inner_msg in str(exec_info.value.__cause__)
-
-def test_is_l1_ratio_one():
-    """Test _is_l1_ratio_one helper function"""
-    from sklearn.feature_selection._from_model import _is_l1_ratio_one
-    # Test single values
-    assert _is_l1_ratio_one(1.0) == True
-    assert _is_l1_ratio_one(0.9) == False
-    assert _is_l1_ratio_one(0.0) == False
-    # Test lists
-    assert _is_l1_ratio_one([0.5, 1.0, 0.8]) == True
-    assert _is_l1_ratio_one([0.5, 0.8]) == False
-    # Test numpy arrays
-    assert _is_l1_ratio_one(np.array([0.5, 1.0])) == True
-    assert _is_l1_ratio_one(np.array([0.5, 0.8])) == False
-    # Test empty sequences
-    assert _is_l1_ratio_one([]) == False
-    assert _is_l1_ratio_one(np.array([])) == False
-
-def test_invalid_l1_ratio():
-    """Test that invalid l1_ratio values are handled correctly"""
-    # Values outside [0,1] should work but return False
-    clf = ElasticNetCV(l1_ratio=[0.5, 1.1])
-    model = SelectFromModel(estimator=clf)
-    X, y = make_regression(n_samples=100, n_features=10)
-    model.fit(X, y)
-    # Model should still work, just ignoring invalid l1_ratio
-    assert model.transform(X).shape[1] < X.shape[1]
-
-def test_l1_ratio_edge_cases():
-    """Test edge cases for l1_ratio detection"""
-    from sklearn.feature_selection._from_model import _is_l1_ratio_one
-    # Test values very close to 1
-    assert _is_l1_ratio_one(1.0 + 1e-10) == True
-    assert _is_l1_ratio_one([0.5, 1.0 + 1e-10]) == True
-    # Test mixed valid/invalid values
-    assert _is_l1_ratio_one([0.5, 1.1, 1.0]) == True
-    # Test non-numeric values
-    with pytest.raises(TypeError):
-        _is_l1_ratio_one(['a', 'b'])

From b0f51a42c6a3fa41dfb55ad5caee3c69985b7a02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Tue, 6 May 2025 14:46:24 +0200
Subject: [PATCH 3/3] add a changelog entry

---
 .../upcoming_changes/sklearn.feature_selection/31107.fix.rst | 4 ++++
 sklearn/feature_selection/tests/test_from_model.py           | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 doc/whats_new/upcoming_changes/sklearn.feature_selection/31107.fix.rst

diff --git a/doc/whats_new/upcoming_changes/sklearn.feature_selection/31107.fix.rst b/doc/whats_new/upcoming_changes/sklearn.feature_selection/31107.fix.rst
new file mode 100644
index 0000000000000..b5ca4ab283434
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.feature_selection/31107.fix.rst
@@ -0,0 +1,4 @@
+- :class:`feature_selection.SelectFromModel` now correctly works when the estimator
+  is an instance of :class:`linear_model.ElasticNetCV` with its `l1_ratio` parameter
+  being an array-like.
+  By :user:`Vasco Pereira `.
diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py
index 53d33f6ebc898..17bedf44748fb 100644
--- a/sklearn/feature_selection/tests/test_from_model.py
+++ b/sklearn/feature_selection/tests/test_from_model.py
@@ -493,7 +493,7 @@ def test_get_feature_names_out_elasticnetcv():
     """Check if ElasticNetCV works with a list of floats.
 
     Non-regression test for #30936."""
-    X, y = make_regression(n_samples=100, n_features=5, n_informative=3, random_state=0)
+    X, y = make_regression(n_features=5, n_informative=3, random_state=0)
     estimator = ElasticNetCV(l1_ratio=[0.25, 0.5, 0.75], random_state=0)
     selector = SelectFromModel(estimator=estimator)
     selector.fit(X, y)
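
For context, a minimal usage sketch (not part of the patch series itself) of the scenario the non-regression test above exercises: SelectFromModel wrapping an ElasticNetCV whose l1_ratio is a list of floats. It only uses public scikit-learn APIs; the feature names such as "x0" are the defaults produced when the input has no column names.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import ElasticNetCV

X, y = make_regression(n_features=5, n_informative=3, random_state=0)

# l1_ratio is a list, so ElasticNetCV cross-validates over several ratios.
# Before the fix, the default-threshold heuristic compared the list-valued
# l1_ratio attribute to 1.0 with np.isclose, which yields an array rather
# than a single boolean; after the fix only the fitted scalar l1_ratio_
# (ElasticNetCV) or the scalar l1_ratio (ElasticNet) is inspected.
selector = SelectFromModel(ElasticNetCV(l1_ratio=[0.25, 0.5, 0.75], random_state=0))
selector.fit(X, y)

print(selector.get_support())            # boolean mask over the 5 input features
print(selector.get_feature_names_out())  # subset of ['x0', ..., 'x4']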