Commit ec74b2a

FEAT rfecv: add support and ranking for each cv and step (scikit-learn#30179)
Co-authored-by: MarieS-WiMLDS <79304610+MarieS-WiMLDS@users.noreply.github.com>
Co-authored-by: Adrin Jalali <adrin.jalali@gmail.com>
1 parent 7cc6032 commit ec74b2a

File tree
4 files changed: +96 -13 lines
@@ -0,0 +1,3 @@
+- :class:`feature_selection.RFECV` now gives access to the ranking and support in each
+  iteration and cv step of feature selection.
+  By :user:`Marie S. <MarieSacksick>`
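Aside (not part of the commit): a minimal sketch of how the new per-fold entries can be read from a fitted RFECV. The dataset and estimator below are illustrative assumptions, not taken from the changelog.

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Toy setup, assumed for illustration only.
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
rfecv = RFECV(LogisticRegression(max_iter=1000), cv=5).fit(X, y)

# One boolean support mask and one integer ranking per elimination step and fold:
print(rfecv.cv_results_["split0_support"].shape)
print(rfecv.cv_results_["split0_ranking"].shape)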

examples/feature_selection/plot_rfe_with_cross_validation.py

+24-2
@@ -22,9 +22,12 @@
 
 from sklearn.datasets import make_classification
 
+n_features = 15
+feat_names = [f"feature_{i}" for i in range(15)]
+
 X, y = make_classification(
     n_samples=500,
-    n_features=15,
+    n_features=n_features,
     n_informative=3,
     n_redundant=2,
     n_repeated=0,
@@ -71,7 +74,12 @@
 import matplotlib.pyplot as plt
 import pandas as pd
 
-cv_results = pd.DataFrame(rfecv.cv_results_)
+data = {
+    key: value
+    for key, value in rfecv.cv_results_.items()
+    if key in ["n_features", "mean_test_score", "std_test_score"]
+}
+cv_results = pd.DataFrame(data)
 plt.figure()
 plt.xlabel("Number of features selected")
 plt.ylabel("Mean test accuracy")
@@ -91,3 +99,17 @@
 # cross-validation technique. The test accuracy decreases above 5 selected
 # features, that is, keeping non-informative features leads to over-fitting and
 # is therefore detrimental for the statistical performance of the models.
+
+# %%
+import numpy as np
+
+for i in range(cv.n_splits):
+    mask = rfecv.cv_results_[f"split{i}_support"][
+        rfecv.n_features_
+    ]  # mask of features selected by the RFE
+    features_selected = np.ma.compressed(np.ma.masked_array(feat_names, mask=1 - mask))
+    print(f"Features selected in fold {i}: {features_selected}")
+# %%
+# In the five folds, the selected features are consistent. This is good news:
+# it means that the selection is stable across folds, and it confirms that
+# these features are the most informative ones.
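Aside (not part of the commit): the numpy.ma idiom in the loop above is equivalent to plain boolean indexing. A self-contained sketch, with a hand-made mask standing in for one row of split{i}_support:

import numpy as np

feat_names = np.array([f"feature_{i}" for i in range(15)])
mask = np.zeros(15, dtype=bool)
mask[[3, 7, 11]] = True  # assume features 3, 7 and 11 were selected

masked = np.ma.compressed(np.ma.masked_array(feat_names, mask=1 - mask))
indexed = feat_names[mask]  # simpler equivalent
assert (masked == indexed).all()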

sklearn/feature_selection/_rfe.py

+36-10
@@ -62,7 +62,7 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer, routed_params):
         **fit_params,
     )
 
-    return rfe.step_scores_, rfe.step_n_features_
+    return rfe.step_scores_, rfe.step_support_, rfe.step_ranking_, rfe.step_n_features_
 
 
 class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
@@ -318,6 +318,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
         if step_score:
             self.step_n_features_ = []
             self.step_scores_ = []
+            self.step_support_ = []
+            self.step_ranking_ = []
 
         # Elimination
         while np.sum(support_) > n_features_to_select:
@@ -331,6 +333,14 @@
 
             estimator.fit(X[:, features], y, **fit_params)
 
+            # Compute step values on the previous selection iteration because
+            # 'estimator' must use features that have not been eliminated yet
+            if step_score:
+                self.step_n_features_.append(len(features))
+                self.step_scores_.append(step_score(estimator, features))
+                self.step_support_.append(list(support_))
+                self.step_ranking_.append(list(ranking_))
+
             # Get importance and rank them
             importances = _get_feature_importances(
                 estimator,
@@ -345,12 +355,6 @@
             # Eliminate the worst features
             threshold = min(step, np.sum(support_) - n_features_to_select)
 
-            # Compute step score on the previous selection iteration
-            # because 'estimator' must use features
-            # that have not been eliminated yet
-            if step_score:
-                self.step_n_features_.append(len(features))
-                self.step_scores_.append(step_score(estimator, features))
             support_[features[ranks][:threshold]] = False
             ranking_[np.logical_not(support_)] += 1
 
@@ -359,10 +363,12 @@
         self.estimator_ = clone(self.estimator)
         self.estimator_.fit(X[:, features], y, **fit_params)
 
-        # Compute step score when only n_features_to_select features left
+        # Compute step values when only n_features_to_select features left
         if step_score:
             self.step_n_features_.append(len(features))
             self.step_scores_.append(step_score(self.estimator_, features))
+            self.step_support_.append(support_)
+            self.step_ranking_.append(ranking_)
         self.n_features_ = support_.sum()
         self.support_ = support_
         self.ranking_ = ranking_
@@ -674,6 +680,20 @@ class RFECV(RFE):
 
         .. versionadded:: 1.5
 
+    split(k)_ranking : ndarray of shape (n_subsets_of_features,)
+        The cross-validation rankings across the (k)th fold.
+        Selected (i.e., estimated best) features are assigned rank 1.
+        An illustration is given in
+        :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`.
+
+        .. versionadded:: 1.7
+
+    split(k)_support : ndarray of shape (n_subsets_of_features,)
+        The cross-validation supports across the (k)th fold. The support
+        is the mask of the selected features.
+
+        .. versionadded:: 1.7
+
     n_features_ : int
         The number of selected features with cross-validation.
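Aside (not part of the diff): a small self-contained check of the rank-1 convention documented above, under an assumed toy setup; at every step of the elimination, features still in the support carry rank 1.

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

# Toy problem, assumed for illustration only.
X, y = make_classification(n_samples=100, n_features=6, random_state=0)
rfecv = RFECV(SVC(kernel="linear"), cv=3).fit(X, y)

for step_ranking, step_support in zip(
    rfecv.cv_results_["split0_ranking"], rfecv.cv_results_["split0_support"]
):
    # Features not yet eliminated at this step all have rank 1.
    assert (step_ranking[step_support] == 1).all()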
@@ -874,14 +894,16 @@ def fit(self, X, y, *, groups=None, **params):
         parallel = Parallel(n_jobs=self.n_jobs)
         func = delayed(_rfe_single_fit)
 
-        scores_features = parallel(
+        step_results = parallel(
             func(clone(rfe), self.estimator, X, y, train, test, scorer, routed_params)
             for train, test in cv.split(X, y, **routed_params.splitter.split)
         )
-        scores, step_n_features = zip(*scores_features)
+        scores, supports, rankings, step_n_features = zip(*step_results)
 
         step_n_features_rev = np.array(step_n_features[0])[::-1]
         scores = np.array(scores)
+        rankings = np.array(rankings)
+        supports = np.array(supports)
 
         # Reverse order such that lowest number of features is selected in case of tie.
         scores_sum_rev = np.sum(scores, axis=0)[::-1]
@@ -907,10 +929,14 @@ def fit(self, X, y, *, groups=None, **params):
 
         # reverse to stay consistent with before
         scores_rev = scores[:, ::-1]
+        supports_rev = supports[:, ::-1]
+        rankings_rev = rankings[:, ::-1]
         self.cv_results_ = {
             "mean_test_score": np.mean(scores_rev, axis=0),
             "std_test_score": np.std(scores_rev, axis=0),
             **{f"split{i}_test_score": scores_rev[i] for i in range(scores.shape[0])},
+            **{f"split{i}_ranking": rankings_rev[i] for i in range(rankings.shape[0])},
+            **{f"split{i}_support": supports_rev[i] for i in range(supports.shape[0])},
             "n_features": step_n_features_rev,
         }
         return self
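Aside (not part of the diff): a standalone sketch of the zip(*...) aggregation used above, with hand-written toy fold results standing in for the parallel _rfe_single_fit calls.

import numpy as np

# Each tuple: (step_scores, step_supports, step_rankings, step_n_features)
# for one fold, recorded from all features down to one (toy values).
step_results = [
    ([0.90, 0.80], [[True, True], [True, False]], [[1, 1], [1, 2]], [2, 1]),
    ([0.85, 0.70], [[True, True], [False, True]], [[1, 1], [2, 1]], [2, 1]),
]
scores, supports, rankings, step_n_features = zip(*step_results)
scores = np.array(scores)      # shape (n_splits, n_steps)
supports = np.array(supports)  # shape (n_splits, n_steps, n_features)
print(scores[:, ::-1])         # reversed so the smallest subset comes first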

sklearn/feature_selection/tests/test_rfe.py

+33-1
@@ -2,6 +2,7 @@
 Testing Recursive feature elimination
 """
 
+import re
 from operator import attrgetter
 
 import numpy as np
@@ -541,7 +542,11 @@ def test_rfecv_std_and_mean(global_random_seed):
 
     rfecv = RFECV(estimator=SVC(kernel="linear"))
     rfecv.fit(X, y)
-    split_keys = [key for key in rfecv.cv_results_.keys() if "split" in key]
+    split_keys = [
+        key
+        for key in rfecv.cv_results_.keys()
+        if re.search(r"split\d+_test_score", key)
+    ]
     cv_scores = np.asarray([rfecv.cv_results_[key] for key in split_keys])
     expected_mean = np.mean(cv_scores, axis=0)
     expected_std = np.std(cv_scores, axis=0)
@@ -721,3 +726,30 @@ def test_rfe_with_joblib_threading_backend(global_random_seed):
     rfe.fit(X, y)
 
     assert_array_equal(ranking_ref, rfe.ranking_)
+
+
+def test_results_per_cv_in_rfecv(global_random_seed):
+    """
+    Test that the results of RFECV are consistent across the different folds
+    in terms of the length of the arrays.
+    """
+    X, y = make_classification(random_state=global_random_seed)
+
+    clf = LogisticRegression()
+    rfecv = RFECV(
+        estimator=clf,
+        n_jobs=2,
+        cv=5,
+    )
+
+    rfecv.fit(X, y)
+
+    assert len(rfecv.cv_results_["split1_test_score"]) == len(
+        rfecv.cv_results_["split2_test_score"]
+    )
+    assert len(rfecv.cv_results_["split1_support"]) == len(
+        rfecv.cv_results_["split2_support"]
+    )
+    assert len(rfecv.cv_results_["split1_ranking"]) == len(
+        rfecv.cv_results_["split2_ranking"]
+    )
