Closed
Description
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from joblib import Parallel, delayed
# Synthetic 8-class dataset: 500 samples and 15 features, of which 3 are
# informative and 2 are redundant. The seed is fixed so the data is the same
# on every run.
X, y = make_classification(
    n_samples=500,
    n_features=15,
    n_informative=3,
    n_redundant=2,
    n_repeated=0,
    n_classes=8,
    n_clusters_per_class=1,
    class_sep=0.8,
    random_state=0,
)

# Minimum number of features to consider
min_features_to_select = 1

# Module-level estimator and splitter: every call to fit() below reuses these
# same two objects rather than creating its own.
clf = LogisticRegression()
cv = StratifiedKFold(n_splits=5)
def fit():
    """Build an RFECV around the module-level ``clf`` and ``cv`` and fit it.

    Reads the globals ``X``, ``y``, ``clf``, ``cv`` and
    ``min_features_to_select``. The selector itself runs with ``n_jobs=2``.
    Nothing is returned; the fitted selector is discarded.
    """
    selector = RFECV(
        estimator=clf,
        step=1,
        cv=cv,
        scoring="accuracy",
        min_features_to_select=min_features_to_select,
        n_jobs=2,
    )
    selector.fit(X, y)
# Dispatch five fit() calls through joblib with two workers. Each fit() itself
# runs RFECV with n_jobs=2, so the parallelism is nested.
Parallel(n_jobs=2)(delayed(fit)() for _ in range(5))
You can get two types of errors:
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part.
or
AttributeError: 'LogisticRegression' object has no attribute 'coef_'
I don't quite understand what is happening yet, but it seems like there is a side effect somewhere. I would have thought that the inner parallelism would make a copy, but apparently it does not. Using `clone`
in
scikit-learn/sklearn/feature_selection/_rfe.py
Lines 886 to 889 in e04142c
seems to fix it:
diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py
index 8ccbffce9b..99aa8e2b4f 100644
--- a/sklearn/feature_selection/_rfe.py
+++ b/sklearn/feature_selection/_rfe.py
@@ -886,7 +886,7 @@ class RFECV(RFE):
func = delayed(_rfe_single_fit)
scores_features = parallel(
- func(rfe, self.estimator, X, y, train, test, scorer, routed_params)
+ func(clone(rfe), self.estimator, X, y, train, test, scorer, routed_params)
for train, test in cv.split(X, y, **routed_params.splitter.split)
)
scores, step_n_features = zip(*scores_features)
This was seen in #29614 (and also in private testing of free-threaded CPython 3.13 with the default joblib backend set to threading, but I thought it was threading-related).