FIX add metadata routing to CV splitters in RidgeCV and RidgeClassifierCV #29634


Merged (16 commits, Oct 7, 2024)
5 changes: 5 additions & 0 deletions doc/whats_new/v1.6.rst
@@ -138,6 +138,11 @@ more details.
now support metadata routing.
:pr:`29312` by :user:`Omar Salman <OmarManzoor>`.

+ - |Fix| Metadata is routed correctly to grouped CV splitters via
+   :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV` and
+   `UnsetMetadataPassedError` is fixed for :class:`linear_model.RidgeClassifierCV` with
+   default scoring. :pr:`29634` by :user:`Stefanie Senger <StefanieSenger>`.

Dropping support for building with setuptools
---------------------------------------------

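As a usage illustration of this changelog entry, here is a minimal sketch (assuming scikit-learn 1.6 with this fix; the dataset and group layout are made up):

    import numpy as np
    from sklearn import set_config
    from sklearn.datasets import make_regression
    from sklearn.linear_model import RidgeCV
    from sklearn.model_selection import GroupKFold

    set_config(enable_metadata_routing=True)

    X, y = make_regression(n_samples=100, n_features=5, random_state=42)
    groups = np.random.default_rng(42).integers(0, 10, size=100)

    # `groups` is routed through RidgeCV to the grouped splitter, which
    # before this fix never received it.
    ridge = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=GroupKFold(n_splits=2))
    ridge.fit(X, y, groups=groups)
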
35 changes: 23 additions & 12 deletions sklearn/linear_model/_ridge.py
@@ -2178,8 +2178,6 @@ def fit(self, X, y, sample_weight=None, score_params=None):

X_mean, *decomposition = decompose(X, y, sqrt_sw)

- scorer = self._get_scorer()

n_y = 1 if len(y.shape) == 1 else y.shape[1]
n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas)

@@ -2190,7 +2188,7 @@

for i, alpha in enumerate(np.atleast_1d(self.alphas)):
G_inverse_diag, c = solve(float(alpha), y, sqrt_sw, X_mean, *decomposition)
- if scorer is None:
+ if self.scoring is None:
squared_errors = (c / G_inverse_diag) ** 2
alpha_score = self._score_without_scorer(squared_errors=squared_errors)
if self.store_cv_results:
@@ -2213,7 +2211,7 @@
predictions=predictions,
y=unscaled_y,
n_y=n_y,
- scorer=scorer,
+ scorer=self.scoring,
score_params=score_params,
)

@@ -2258,9 +2256,6 @@ def fit(self, X, y, sample_weight=None, score_params=None):

return self

- def _get_scorer(self):
-     return check_scoring(self, scoring=self.scoring, allow_none=True)

def _score_without_scorer(self, squared_errors):
"""Performs scoring using squared errors when the scorer is None."""
if self.alpha_per_target:
@@ -2382,6 +2377,7 @@ def fit(self, X, y, sample_weight=None, **params):
"""
_raise_for_params(params, self, "fit")
cv = self.cv
+ scorer = self._get_scorer()

# TODO(1.7): Remove in 1.7
# Also change `store_cv_results` default back to False
@@ -2445,10 +2441,14 @@
if sample_weight is not None:
routed_params.scorer.score["sample_weight"] = sample_weight

+ # reset `scorer` variable to the original user intent if no scoring is passed
+ if self.scoring is None:
+     scorer = None
Comment on lines +2445 to +2446
glemaitre (Member):

I must be missing something. Why would we not combine the above call into the same statement?

if self.scoring is not None:
    scorer = self._get_scorer()
else:
    scorer = None

Or is it because we still want to set the score_request? In that case I would expect

scorer = self._get_scorer()
if self.scoring is None:
    scorer = None

I would probably add a small comment at the top explaining why we do this, to keep someone from reverting to the first version.

StefanieSenger (Contributor Author):

Thanks for looking into this. I am not sure if I understand what you mean, @glemaitre.

I will try to explain what I meant:

We have to call self._get_scorer() in any case (even with scoring=None), because in that case we need to use RidgeCV's score() method via _PassthroughScorer.

So the code needs to be like this:

scorer = self._get_scorer()  
...
if self.scoring is None:  
    scorer = None  

The scorer needs to be defined (scorer = self._get_scorer()) early in fit(), because it is already used in process_routing() if metadata routing is enabled and cv is None.

But then below we need to reset scorer to None if the user's original intent was to not pass a scoring:

if self.scoring is None:  
    scorer = None 

If we did not do that, we would get an AttributeError: '_IdentityRegressor' object has no attribute 'score', because scorer would already be something other than None; we would then pass that into _RidgeGCV() and go down the wrong code path.

Basically, we are overloading the scorer variable with too many meanings at once. But maybe that's better than messing with the conventions?

I have added a comment to explain what I mean. Does it all make sense?

glemaitre (Member):

Yes, completely. I missed that we would get the combination of _PassthroughScorer + _IdentityRegressor, which ends up being an estimator that is not scikit-learn compatible.

I think we can merge as-is. Maybe one clean-up could be to check that we get an _Identity... estimator that follows the scikit-learn API, but I'm not sure we would end up with something working.
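
To summarize the thread, here is a condensed paraphrase of the resulting fit() control flow; the names (_get_scorer, _RidgeGCV, _PassthroughScorer, _IdentityRegressor) come from the diff and the discussion above, and everything else is elided:

    def fit(self, X, y, sample_weight=None, **params):
        # Obtained early: when metadata routing is enabled and cv is None,
        # the scorer's metadata request is consumed by process_routing().
        scorer = self._get_scorer()

        # ... validation, routing and sample_weight handling elided ...

        # Reset to the user's original intent: with scoring=None, _RidgeGCV
        # must take its squared-error path; keeping the _PassthroughScorer
        # would later wrap an _IdentityRegressor, which has no score method.
        if self.scoring is None:
            scorer = None

        estimator = _RidgeGCV(self.alphas, scoring=scorer)  # other args elided
        # ... fit the estimator and copy over the fitted attributes ...
        return self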


estimator = _RidgeGCV(
alphas,
fit_intercept=self.fit_intercept,
- scoring=self.scoring,
+ scoring=scorer,
gcv_mode=self.gcv_mode,
store_cv_results=self._store_cv_results,
is_clf=is_classifier(self),
@@ -2484,7 +2484,7 @@ def fit(self, X, y, sample_weight=None, **params):
estimator,
parameters,
cv=cv,
- scoring=self.scoring,
+ scoring=scorer,
)

grid_search.fit(X, y, **params)
@@ -2518,14 +2518,25 @@ def get_metadata_routing(self):
MetadataRouter(owner=self.__class__.__name__)
.add_self_request(self)
.add(
- scorer=self._get_scorer(),
- method_mapping=MethodMapping().add(callee="score", caller="fit"),
+ scorer=self.scoring,
+ method_mapping=MethodMapping().add(caller="fit", callee="score"),
)
+ .add(
+     splitter=self.cv,
+     method_mapping=MethodMapping().add(caller="fit", callee="split"),
+ )
)
return router

def _get_scorer(self):
- return check_scoring(self, scoring=self.scoring, allow_none=True)
+ scorer = check_scoring(estimator=self, scoring=self.scoring, allow_none=True)
+ if _routing_enabled() and self.scoring is None:
+     # This estimator passes an array of 1s as sample_weight even if
+     # sample_weight is not provided by the user. Therefore we need to
+     # always request it. But we don't set it if it's passed explicitly
+     # by the user.
+     scorer.set_score_request(sample_weight=True)
+ return scorer

# TODO(1.7): Remove
# mypy error: Decorated property not supported
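A small inspection sketch of the get_metadata_routing() change above (the printed representation is indicative; its exact formatting varies by version):

    from sklearn.linear_model import RidgeCV
    from sklearn.model_selection import GroupKFold

    router = RidgeCV(cv=GroupKFold(n_splits=2)).get_metadata_routing()
    # The router now carries a "splitter" entry (caller "fit" -> callee
    # "split") next to the "scorer" entry, so `groups` can reach
    # GroupKFold.split.
    print(router)
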
16 changes: 16 additions & 0 deletions sklearn/linear_model/tests/test_ridge.py
@@ -2369,5 +2369,21 @@ def test_metadata_routing_with_default_scoring(metaestimator):
metaestimator().get_metadata_routing()


+ @pytest.mark.usefixtures("enable_slep006")
+ @pytest.mark.parametrize(
+     "metaestimator, make_dataset",
+     [
+         (RidgeCV(), make_regression),
+         (RidgeClassifierCV(), make_classification),
+     ],
+ )
+ def test_set_score_request_with_default_scoring(metaestimator, make_dataset):
+     """Test that `set_score_request` is set within `RidgeCV.fit()` and
+     `RidgeClassifierCV.fit()` when using the default scoring, and that no
+     UnsetMetadataPassedError is raised. Regression test for the fix in PR #29634."""
+     X, y = make_dataset(n_samples=100, n_features=5, random_state=42)
+     metaestimator.fit(X, y, sample_weight=np.ones(X.shape[0]))


# End of Metadata Routing Tests
# =============================
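
The scenario the new test guards against is easy to hit from user code. A minimal sketch of it outside the test harness (assuming metadata routing is enabled and mirroring the test's data setup):

    import numpy as np
    from sklearn import set_config
    from sklearn.datasets import make_classification
    from sklearn.linear_model import RidgeClassifierCV

    set_config(enable_metadata_routing=True)

    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
    # With default scoring (scoring=None) this raised UnsetMetadataPassedError
    # before the fix; the pass-through scorer now requests sample_weight itself.
    RidgeClassifierCV().fit(X, y, sample_weight=np.ones(X.shape[0]))
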
58 changes: 45 additions & 13 deletions sklearn/tests/test_metaestimators_metadata_routing.py
@@ -40,13 +40,17 @@
RidgeClassifierCV,
RidgeCV,
)
+ from sklearn.metrics._regression import mean_squared_error
+ from sklearn.metrics._scorer import make_scorer
from sklearn.model_selection import (
FixedThresholdClassifier,
GridSearchCV,
+ GroupKFold,
HalvingGridSearchCV,
HalvingRandomSearchCV,
RandomizedSearchCV,
TunedThresholdClassifierCV,
+ cross_validate,
)
from sklearn.multiclass import (
OneVsOneClassifier,
@@ -83,7 +87,7 @@
classes_multi = [np.unique(y_multi[:, i]) for i in range(y_multi.shape[1])]
metadata = rng.randint(0, 10, size=N)
sample_weight = rng.rand(N)
- groups = np.array([0, 1] * (len(y) // 2))
+ groups = rng.randint(0, 10, size=len(y))


@pytest.fixture(autouse=True)
@@ -620,9 +624,9 @@ def test_registry_copy():
@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
def test_default_request(metaestimator):
# Check that by default request is empty and the right type
- cls = metaestimator["metaestimator"]
+ metaestimator_class = metaestimator["metaestimator"]
kwargs, *_ = get_init_args(metaestimator, sub_estimator_consumes=True)
- instance = cls(**kwargs)
+ instance = metaestimator_class(**kwargs)
if "cv_name" in metaestimator:
# Our GroupCV splitters request groups by default, which we should
# ignore in this test.
@@ -642,7 +646,7 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator):
# sub-estimator, e.g. MyMetaEstimator(estimator=MySubEstimator())
return

- cls = metaestimator["metaestimator"]
+ metaestimator_class = metaestimator["metaestimator"]
X = metaestimator["X"]
y = metaestimator["y"]
routing_methods = metaestimator["estimator_routing_methods"]
@@ -656,7 +660,7 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator):
scorer.set_score_request(**{key: True})
val = {"sample_weight": sample_weight, "metadata": metadata}[key]
method_kwargs = {key: val}
- instance = cls(**kwargs)
+ instance = metaestimator_class(**kwargs)
msg = (
f"[{key}] are passed but are not explicitly set as requested or not"
f" requested for {estimator.__class__.__name__}.{method_name}"
@@ -700,7 +704,7 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator):
# sub-estimator, e.g. MyMetaEstimator(estimator=MySubEstimator())
return

- cls = metaestimator["metaestimator"]
+ metaestimator_class = metaestimator["metaestimator"]
X = metaestimator["X"]
y = metaestimator["y"]
routing_methods = metaestimator["estimator_routing_methods"]
@@ -730,7 +734,7 @@
metadata_name=key,
)

- instance = cls(**kwargs)
+ instance = metaestimator_class(**kwargs)
method = getattr(instance, method_name)
extra_method_args = metaestimator.get("method_args", {}).get(
method_name, {}
@@ -775,7 +779,7 @@ def set_request(estimator, method_name):
if is_classifier(estimator) and method_name == "partial_fit":
estimator.set_partial_fit_request(classes=True)

- cls = metaestimator["metaestimator"]
+ metaestimator_class = metaestimator["metaestimator"]
X = metaestimator["X"]
y = metaestimator["y"]
routing_methods = metaestimator["estimator_routing_methods"]
@@ -784,7 +788,7 @@
kwargs, (estimator, _), (_, _), (_, _) = get_init_args(
metaestimator, sub_estimator_consumes=False
)
- instance = cls(**kwargs)
+ instance = metaestimator_class(**kwargs)
set_request(estimator, method_name)
method = getattr(instance, method_name)
extra_method_args = metaestimator.get("method_args", {}).get(method_name, {})
@@ -807,7 +811,7 @@ def test_metadata_is_routed_correctly_to_scorer(metaestimator):
# This test only makes sense for CV estimators
return

- cls = metaestimator["metaestimator"]
+ metaestimator_class = metaestimator["metaestimator"]
routing_methods = metaestimator["scorer_routing_methods"]
method_mapping = metaestimator.get("method_mapping", {})

@@ -825,7 +829,7 @@
methods=[method_name],
metadata_name="sample_weight",
)
- instance = cls(**kwargs)
+ instance = metaestimator_class(**kwargs)
method = getattr(instance, method_name)
method_kwargs = {"sample_weight": sample_weight}
if "fit" not in method_name:
@@ -852,7 +856,7 @@ def test_metadata_is_routed_correctly_to_splitter(metaestimator):
# This test is only for metaestimators accepting a CV splitter
return

- cls = metaestimator["metaestimator"]
+ metaestimator_class = metaestimator["metaestimator"]
routing_methods = metaestimator["cv_routing_methods"]
X_ = metaestimator["X"]
y_ = metaestimator["y"]
@@ -866,7 +870,7 @@
if scorer:
scorer.set_score_request(sample_weight=False, metadata=False)
cv.set_split_request(groups=True, metadata=True)
- instance = cls(**kwargs)
+ instance = metaestimator_class(**kwargs)
method_kwargs = {"groups": groups, "metadata": metadata}
method = getattr(instance, method_name)
method(X_, y_, **method_kwargs)
@@ -875,3 +879,31 @@
check_recorded_metadata(
obj=_splitter, method="split", parent=method_name, **method_kwargs
)


+ @pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
+ def test_metadata_routed_to_group_splitter(metaestimator):
+     """Test that groups are routed correctly when the group splitter of a CV
+     estimator is used within cross_validate. Regression test for the issue
+     described in PR #29634: `ValueError: The 'groups' parameter should not
+     be None.` must not be raised."""
+
+     if "cv_routing_methods" not in metaestimator:
+         # This test is only for metaestimators accepting a CV splitter
+         return
+
+     metaestimator_class = metaestimator["metaestimator"]
+     X_ = metaestimator["X"]
+     y_ = metaestimator["y"]
+
+     kwargs, *_ = get_init_args(metaestimator, sub_estimator_consumes=True)
+     # remove `ConsumingSplitter` from kwargs, so the 'cv' param isn't passed twice:
+     kwargs.pop("cv", None)
+     instance = metaestimator_class(cv=GroupKFold(n_splits=2), **kwargs)
+     cross_validate(
+         instance,
+         X_,
+         y_,
+         params={"groups": groups},
+         cv=GroupKFold(n_splits=2),
+         scoring=make_scorer(mean_squared_error, response_method="predict"),
+     )
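
A note on this test's design: GroupKFold appears twice deliberately. The outer instance, passed to cross_validate, consumes `groups` directly from `params`, while the inner instance, set as the metaestimator's own cv, only receives `groups` through the metaestimator's router. Before this PR that router had no splitter entry, so the inner split() ended up being called without groups and raised the ValueError quoted in the docstring.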