diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b1edda7900b81..3de4c40e109c7 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -56,7 +56,7 @@ Changelog - |Enhancement| Functions :func:`datasets.make_circles` and :func:`datasets.make_moons` now accept two-element tuple. - :pr:`15707` by :user:`Maciej J Mikulski ` + :pr:`15707` by :user:`Maciej J Mikulski `. :mod:`sklearn.linear_model` ........................... @@ -66,6 +66,13 @@ Changelog the wrapped `base_estimator` during the fitting of the final model. :pr:`15573` by :user:`Jeremy Alexandre `. +- |Efficiency| :class:`linear_model.RidgeCV` and + :class:`linear_model.RidgeClassifierCV` no longer allocate a + potentially large array to store dual coefficients for all hyperparameters + during `fit`, nor an array to store all errors or LOO predictions unless + `store_cv_values` is `True`. + :pr:`15652` by :user:`Jérôme Dockès `. + :mod:`sklearn.preprocessing` ............................ @@ -78,4 +85,3 @@ Changelog - |Fix| :func:`tree.plot_tree` `rotate` parameter was unused and has been deprecated. :pr:`15806` by :user:`Chiara Marmo `. - diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 9e1dd7f22085d..1c0407066048c 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1054,6 +1054,16 @@ def _matmat(self, v): return res +class _IdentityEstimator: + """Hack to call a scorer when we already have the predictions.""" + + def decision_function(self, y_predict): + return y_predict + + def predict(self, y_predict): + return y_predict + + class _RidgeGCV(LinearModel): """Ridge regression with built-in Generalized Cross-Validation @@ -1087,6 +1097,10 @@ class _RidgeGCV(LinearModel): looe = y - loov = c / diag(G^-1) + The best score (negative mean squared error or user-provided scoring) is + stored in the `best_score_` attribute, and the selected hyperparameter in + `alpha_`. 
+ References ---------- http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf @@ -1462,43 +1476,40 @@ def fit(self, X, y, sample_weight=None): else: sqrt_sw = np.ones(X.shape[0], dtype=X.dtype) + X_mean, *decomposition = decompose(X, y, sqrt_sw) + scorer = check_scoring(self, scoring=self.scoring, allow_none=True) error = scorer is None n_y = 1 if len(y.shape) == 1 else y.shape[1] - cv_values = np.zeros((n_samples * n_y, len(self.alphas)), - dtype=X.dtype) - C = [] - X_mean, *decomposition = decompose(X, y, sqrt_sw) + + if self.store_cv_values: + self.cv_values_ = np.empty( + (n_samples * n_y, len(self.alphas)), dtype=X.dtype) + + best_coef, best_score, best_alpha = None, None, None + for i, alpha in enumerate(self.alphas): G_inverse_diag, c = solve( float(alpha), y, sqrt_sw, X_mean, *decomposition) if error: squared_errors = (c / G_inverse_diag) ** 2 - cv_values[:, i] = squared_errors.ravel() + alpha_score = -squared_errors.mean() + if self.store_cv_values: + self.cv_values_[:, i] = squared_errors.ravel() else: predictions = y - (c / G_inverse_diag) - cv_values[:, i] = predictions.ravel() - C.append(c) + alpha_score = scorer( + _IdentityEstimator(), predictions.ravel(), y.ravel()) + if self.store_cv_values: + self.cv_values_[:, i] = predictions.ravel() - if error: - best = cv_values.mean(axis=0).argmin() - else: - # The scorer want an object that will make the predictions but - # they are already computed efficiently by _RidgeGCV. 
This - # identity_estimator will just return them - def identity_estimator(): - pass - identity_estimator.decision_function = lambda y_predict: y_predict - identity_estimator.predict = lambda y_predict: y_predict - - # signature of scorer is (estimator, X, y) - out = [scorer(identity_estimator, cv_values[:, i], y.ravel()) - for i in range(len(self.alphas))] - best = np.argmax(out) - - self.alpha_ = self.alphas[best] - self.dual_coef_ = C[best] + if (best_score is None) or (alpha_score > best_score): + best_coef, best_score, best_alpha = c, alpha_score, alpha + + self.alpha_ = best_alpha + self.best_score_ = best_score + self.dual_coef_ = best_coef self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) X_offset += X_mean * X_scale @@ -1509,7 +1520,7 @@ def identity_estimator(): cv_values_shape = n_samples, len(self.alphas) else: cv_values_shape = n_samples, n_y, len(self.alphas) - self.cv_values_ = cv_values.reshape(cv_values_shape) + self.cv_values_ = self.cv_values_.reshape(cv_values_shape) return self diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index c786b154fcb85..a95b40a7ba4f6 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -34,6 +34,7 @@ from sklearn.linear_model._ridge import _check_gcv_mode from sklearn.linear_model._ridge import _X_CenterStackOp from sklearn.datasets import make_regression +from sklearn.datasets import make_classification from sklearn.model_selection import GridSearchCV from sklearn.model_selection import KFold, GroupKFold, cross_val_predict @@ -661,6 +662,19 @@ def _test_ridge_cv(filter_): assert type(ridge_cv.intercept_) == np.float64 +@pytest.mark.parametrize( + "ridge, make_dataset", + [(RidgeCV(), make_regression), + (RidgeClassifierCV(), make_classification)] +) +def test_ridge_gcv_cv_values_not_stored(ridge, make_dataset): + # Check that `cv_values_` is not stored when store_cv_values is False + X, y = make_dataset(n_samples=6, 
random_state=42) + ridge.set_params(store_cv_values=False) + ridge.fit(X, y) + assert not hasattr(ridge, "cv_values_") + + def _test_ridge_diabetes(filter_): ridge = Ridge(fit_intercept=False) ridge.fit(filter_(X_diabetes), y_diabetes) @@ -818,7 +832,8 @@ def test_class_weights_cv(): assert_array_equal(reg.predict([[-.2, 2]]), np.array([-1])) -def test_ridgecv_store_cv_values(): +@pytest.mark.parametrize("scoring", [None, 'neg_mean_squared_error']) +def test_ridgecv_store_cv_values(scoring): rng = np.random.RandomState(42) n_samples = 8 @@ -827,7 +842,7 @@ def test_ridgecv_store_cv_values(): alphas = [1e-1, 1e0, 1e1] n_alphas = len(alphas) - r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True) + r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True, scoring=scoring) # with len(y.shape) == 1 y = rng.randn(n_samples) @@ -840,7 +855,7 @@ def test_ridgecv_store_cv_values(): r.fit(x, y) assert r.cv_values_.shape == (n_samples, n_targets, n_alphas) - r = RidgeCV(cv=3, store_cv_values=True) + r = RidgeCV(cv=3, store_cv_values=True, scoring=scoring) assert_raises_regex(ValueError, 'cv!=None and store_cv_values', r.fit, x, y)