diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
index 1a3a89c1156e2..426884ca0513c 100644
--- a/doc/whats_new/v1.6.rst
+++ b/doc/whats_new/v1.6.rst
@@ -138,6 +138,11 @@ more details.
   now support metadata routing.
   :pr:`29312` by :user:`Omar Salman <OmarManzoor>`.
 
+- |Fix| Metadata is now routed correctly to grouped CV splitters via
+  :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV`, and an
+  `UnsetMetadataPassedError` is no longer raised by :class:`linear_model.RidgeClassifierCV`
+  with default scoring. :pr:`29634` by :user:`Stefanie Senger <StefanieSenger>`.
+
 Dropping support for building with setuptools
 ---------------------------------------------
 
diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
index 19ec621868203..00006caece676 100644
--- a/sklearn/linear_model/_ridge.py
+++ b/sklearn/linear_model/_ridge.py
@@ -2178,8 +2178,6 @@ def fit(self, X, y, sample_weight=None, score_params=None):
 
         X_mean, *decomposition = decompose(X, y, sqrt_sw)
 
-        scorer = self._get_scorer()
-
         n_y = 1 if len(y.shape) == 1 else y.shape[1]
         n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas)
 
@@ -2190,7 +2188,7 @@ def fit(self, X, y, sample_weight=None, score_params=None):
 
         for i, alpha in enumerate(np.atleast_1d(self.alphas)):
             G_inverse_diag, c = solve(float(alpha), y, sqrt_sw, X_mean, *decomposition)
-            if scorer is None:
+            if self.scoring is None:
                 squared_errors = (c / G_inverse_diag) ** 2
                 alpha_score = self._score_without_scorer(squared_errors=squared_errors)
                 if self.store_cv_results:
@@ -2213,7 +2211,7 @@ def fit(self, X, y, sample_weight=None, score_params=None):
                     predictions=predictions,
                     y=unscaled_y,
                     n_y=n_y,
-                    scorer=scorer,
+                    scorer=self.scoring,
                     score_params=score_params,
                 )
 
@@ -2258,9 +2256,6 @@ def fit(self, X, y, sample_weight=None, score_params=None):
 
         return self
 
-    def _get_scorer(self):
-        return check_scoring(self, scoring=self.scoring, allow_none=True)
-
     def _score_without_scorer(self, squared_errors):
         """Performs scoring using squared errors when the scorer is None."""
         if self.alpha_per_target:
@@ -2382,6 +2377,7 @@ def fit(self, X, y, sample_weight=None, **params):
         """
         _raise_for_params(params, self, "fit")
         cv = self.cv
+        scorer = self._get_scorer()
 
         # TODO(1.7): Remove in 1.7
         # Also change `store_cv_results` default back to False
@@ -2445,10 +2441,14 @@ def fit(self, X, y, sample_weight=None, **params):
             if sample_weight is not None:
                 routed_params.scorer.score["sample_weight"] = sample_weight
 
+        # reset `scorer` to the user's original intent if no scoring was passed
+        if self.scoring is None:
+            scorer = None
+
         estimator = _RidgeGCV(
             alphas,
             fit_intercept=self.fit_intercept,
-            scoring=self.scoring,
+            scoring=scorer,
             gcv_mode=self.gcv_mode,
             store_cv_results=self._store_cv_results,
             is_clf=is_classifier(self),
@@ -2484,7 +2484,7 @@ def fit(self, X, y, sample_weight=None, **params):
                 estimator,
                 parameters,
                 cv=cv,
-                scoring=self.scoring,
+                scoring=scorer,
             )
 
             grid_search.fit(X, y, **params)
@@ -2518,14 +2518,25 @@ def get_metadata_routing(self):
             MetadataRouter(owner=self.__class__.__name__)
             .add_self_request(self)
             .add(
-                scorer=self._get_scorer(),
-                method_mapping=MethodMapping().add(callee="score", caller="fit"),
+                scorer=self.scoring,
+                method_mapping=MethodMapping().add(caller="fit", callee="score"),
+            )
+            .add(
+                splitter=self.cv,
+                method_mapping=MethodMapping().add(caller="fit", callee="split"),
             )
         )
         return router
 
     def _get_scorer(self):
-        return check_scoring(self, scoring=self.scoring, allow_none=True)
+        scorer = check_scoring(estimator=self, scoring=self.scoring, allow_none=True)
+        if _routing_enabled() and self.scoring is None:
+            # This estimator passes an array of 1s as sample_weight even if
+            # sample_weight is not provided by the user. Therefore we need to
+            # always request it. But we don't set it if it's passed explicitly
+            # by the user.
+            scorer.set_score_request(sample_weight=True)
+        return scorer
 
     # TODO(1.7): Remove
     # mypy error: Decorated property not supported
diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py
index f846c24b7b39f..3eb9739ed7438 100644
--- a/sklearn/linear_model/tests/test_ridge.py
+++ b/sklearn/linear_model/tests/test_ridge.py
@@ -2369,5 +2369,21 @@ def test_metadata_routing_with_default_scoring(metaestimator):
     metaestimator().get_metadata_routing()
 
 
+@pytest.mark.usefixtures("enable_slep006")
+@pytest.mark.parametrize(
+    "metaestimator, make_dataset",
+    [
+        (RidgeCV(), make_regression),
+        (RidgeClassifierCV(), make_classification),
+    ],
+)
+def test_set_score_request_with_default_scoring(metaestimator, make_dataset):
+    """Test that the score request for `sample_weight` is set within `RidgeCV.fit()`
+    and `RidgeClassifierCV.fit()` when using the default scoring, and that no
+    `UnsetMetadataPassedError` is raised. Regression test for the fix in PR #29634."""
+    X, y = make_dataset(n_samples=100, n_features=5, random_state=42)
+    metaestimator.fit(X, y, sample_weight=np.ones(X.shape[0]))
+
+
 # End of Metadata Routing Tests
 # =============================
diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py
index 614c8669592b4..741dfd0537bb1 100644
--- a/sklearn/tests/test_metaestimators_metadata_routing.py
+++ b/sklearn/tests/test_metaestimators_metadata_routing.py
@@ -40,13 +40,17 @@
     RidgeClassifierCV,
     RidgeCV,
 )
+from sklearn.metrics._regression import mean_squared_error
+from sklearn.metrics._scorer import make_scorer
 from sklearn.model_selection import (
     FixedThresholdClassifier,
     GridSearchCV,
+    GroupKFold,
     HalvingGridSearchCV,
     HalvingRandomSearchCV,
     RandomizedSearchCV,
     TunedThresholdClassifierCV,
+    cross_validate,
 )
 from sklearn.multiclass import (
     OneVsOneClassifier,
@@ -83,7 +87,7 @@
 classes_multi = [np.unique(y_multi[:, i]) for i in range(y_multi.shape[1])]
 metadata = rng.randint(0, 10, size=N)
 sample_weight = rng.rand(N)
-groups = np.array([0, 1] * (len(y) // 2))
+groups = rng.randint(0, 10, size=len(y))
 
 
 @pytest.fixture(autouse=True)
@@ -620,9 +624,9 @@ def test_registry_copy():
 @pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
 def test_default_request(metaestimator):
     # Check that by default request is empty and the right type
-    cls = metaestimator["metaestimator"]
+    metaestimator_class = metaestimator["metaestimator"]
     kwargs, *_ = get_init_args(metaestimator, sub_estimator_consumes=True)
-    instance = cls(**kwargs)
+    instance = metaestimator_class(**kwargs)
     if "cv_name" in metaestimator:
         # Our GroupCV splitters request groups by default, which we should
         # ignore in this test.
@@ -642,7 +646,7 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator):
         # sub-estimator, e.g. MyMetaEstimator(estimator=MySubEstimator())
         return
 
-    cls = metaestimator["metaestimator"]
+    metaestimator_class = metaestimator["metaestimator"]
     X = metaestimator["X"]
     y = metaestimator["y"]
     routing_methods = metaestimator["estimator_routing_methods"]
@@ -656,7 +660,7 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator):
                 scorer.set_score_request(**{key: True})
             val = {"sample_weight": sample_weight, "metadata": metadata}[key]
             method_kwargs = {key: val}
-            instance = cls(**kwargs)
+            instance = metaestimator_class(**kwargs)
             msg = (
                 f"[{key}] are passed but are not explicitly set as requested or not"
                 f" requested for {estimator.__class__.__name__}.{method_name}"
@@ -700,7 +704,7 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator):
         # sub-estimator, e.g. MyMetaEstimator(estimator=MySubEstimator())
         return
 
-    cls = metaestimator["metaestimator"]
+    metaestimator_class = metaestimator["metaestimator"]
    X = metaestimator["X"]
     y = metaestimator["y"]
     routing_methods = metaestimator["estimator_routing_methods"]
@@ -730,7 +734,7 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator):
                 metadata_name=key,
             )
-            instance = cls(**kwargs)
+            instance = metaestimator_class(**kwargs)
             method = getattr(instance, method_name)
             extra_method_args = metaestimator.get("method_args", {}).get(
                 method_name, {}
             )
@@ -775,7 +779,7 @@ def set_request(estimator, method_name):
         if is_classifier(estimator) and method_name == "partial_fit":
             estimator.set_partial_fit_request(classes=True)
 
-    cls = metaestimator["metaestimator"]
+    metaestimator_class = metaestimator["metaestimator"]
     X = metaestimator["X"]
     y = metaestimator["y"]
     routing_methods = metaestimator["estimator_routing_methods"]
@@ -784,7 +788,7 @@ def set_request(estimator, method_name):
         kwargs, (estimator, _), (_, _), (_, _) = get_init_args(
             metaestimator, sub_estimator_consumes=False
         )
-        instance = cls(**kwargs)
+        instance = metaestimator_class(**kwargs)
         set_request(estimator, method_name)
         method = getattr(instance, method_name)
         extra_method_args = metaestimator.get("method_args", {}).get(method_name, {})
@@ -807,7 +811,7 @@ def test_metadata_is_routed_correctly_to_scorer(metaestimator):
         # This test only makes sense for CV estimators
         return
 
-    cls = metaestimator["metaestimator"]
+    metaestimator_class = metaestimator["metaestimator"]
     routing_methods = metaestimator["scorer_routing_methods"]
     method_mapping = metaestimator.get("method_mapping", {})
 
@@ -825,7 +829,7 @@ def test_metadata_is_routed_correctly_to_scorer(metaestimator):
             methods=[method_name],
             metadata_name="sample_weight",
         )
-        instance = cls(**kwargs)
+        instance = metaestimator_class(**kwargs)
         method = getattr(instance, method_name)
         method_kwargs = {"sample_weight": sample_weight}
         if "fit" not in method_name:
@@ -852,7 +856,7 @@ def test_metadata_is_routed_correctly_to_splitter(metaestimator):
         # This test is only for metaestimators accepting a CV splitter
         return
 
-    cls = metaestimator["metaestimator"]
+    metaestimator_class = metaestimator["metaestimator"]
     routing_methods = metaestimator["cv_routing_methods"]
     X_ = metaestimator["X"]
     y_ = metaestimator["y"]
@@ -866,7 +870,7 @@ def test_metadata_is_routed_correctly_to_splitter(metaestimator):
         if scorer:
             scorer.set_score_request(sample_weight=False, metadata=False)
         cv.set_split_request(groups=True, metadata=True)
-        instance = cls(**kwargs)
+        instance = metaestimator_class(**kwargs)
         method_kwargs = {"groups": groups, "metadata": metadata}
         method = getattr(instance, method_name)
         method(X_, y_, **method_kwargs)
@@ -875,3 +879,31 @@ def test_metadata_is_routed_correctly_to_splitter(metaestimator):
         check_recorded_metadata(
             obj=_splitter, method="split", parent=method_name, **method_kwargs
         )
+
+
+@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
+def test_metadata_routed_to_group_splitter(metaestimator):
+    """Test that groups are routed correctly when a grouped CV splitter of a CV
+    estimator is used within `cross_validate`. Regression test for PR #29634, which
+    fixed `ValueError: The 'groups' parameter should not be None.` being raised."""
+
+    if "cv_routing_methods" not in metaestimator:
+        # This test is only for metaestimators accepting a CV splitter
+        return
+
+    metaestimator_class = metaestimator["metaestimator"]
+    X_ = metaestimator["X"]
+    y_ = metaestimator["y"]
+
+    kwargs, *_ = get_init_args(metaestimator, sub_estimator_consumes=True)
+    # remove `ConsumingSplitter` from kwargs so that the 'cv' param isn't passed twice:
+    kwargs.pop("cv", None)
+    instance = metaestimator_class(cv=GroupKFold(n_splits=2), **kwargs)
+    cross_validate(
+        instance,
+        X_,
+        y_,
+        params={"groups": groups},
+        cv=GroupKFold(n_splits=2),
+        scoring=make_scorer(mean_squared_error, response_method="predict"),
+    )
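
For reference, a minimal usage sketch (not part of the diff) of the behavior this patch fixes: with metadata routing enabled, `groups` passed to `RidgeCV.fit` now reaches a grouped CV splitter instead of triggering `ValueError: The 'groups' parameter should not be None.`. The dataset shape, alpha grid, and group labels below are illustrative assumptions, not taken from the PR.

# Sketch assuming scikit-learn with this patch applied.
import numpy as np

import sklearn
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GroupKFold

sklearn.set_config(enable_metadata_routing=True)

X, y = make_regression(n_samples=100, n_features=5, random_state=42)
groups = np.tile([0, 1, 2, 3], 25)  # four equally sized groups (illustrative)

# GroupKFold requests `groups` by default, so no explicit `set_split_request`
# call is needed; `fit` routes the metadata to the splitter.
ridge = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=GroupKFold(n_splits=2))
ridge.fit(X, y, groups=groups)
print(ridge.alpha_)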