scikit-learn · jeromedockes · Oct 11, 2019 · Oct 11, 2019 · Oct 11, 2019 · Oct 12, 2019
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
@@ -99,6 +99,7 @@ random sampling procedures.
 - :class:`impute.IterativeImputer` when `X` has features with no missing
   values. |Feature|
 - :class:`linear_model.Ridge` when `X` is sparse. |Fix|
+- :class:`linear_model.RidgeCV` when `cv` is `None`. |Efficiency| |Fix|
 - :class:`model_selection.StratifiedKFold` and any use of `cv=int` with a
   classifier. |Fix|
 
@@ -504,6 +505,22 @@ Changelog
   requires less memory.
   :pr:`14108`, :pr:`14170`, :pr:`14296` by :user:`Alex Henrie <alexhenrie>`.
 
+- |Efficiency| :class:`linear_model.RidgeCV` and
+  :class:`linear_model.RidgeClassifierCV` now does not allocate a
+  potentially large array to store dual coefficients for all hyperparameters
+  during its `fit`, nor an array to store all LOO predictions unless
+  `store_cv_values` is `True`.
+  :pr:`15183` by :user:`Jérôme Dockès <jeromedockes>`.
+
+- |Fix| :class:`linear_model.RidgeCV` returns predictions in the original space
+  when `scoring is not None`. In addition, the reported mean squared errors are
+  normalized by the sample weights for consistency.
+  :pr:`15183` by :user:`Jérôme Dockès <jeromedockes>`.
+
+- |Fix| In :class:`linear_model.RidgeCV`, the predicitons reported by
+  `cv_values_` are now rescaled in the original space when `scoring` is not
+  `None`. :pr:`13995` by :user:`Jérôme Dockès <jeromedockes>`.
+
 - |Fix| :class:`linear_model.Ridge` now correctly fits an intercept when `X` is
   sparse, `solver="auto"` and `fit_intercept=True`, because the default solver
   in this configuration has changed to `sparse_cg`, which can fit an intercept
@@ -807,7 +824,6 @@ Changelog
 - |Fix| The liblinear solver now supports ``sample_weight``.
   :pr:`15038` by :user:`Guillaume Lemaitre <glemaitre>`.
 
-
 :mod:`sklearn.tree`
 ...................
 

diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
@@ -1054,6 +1054,16 @@ def _matmat(self, v):
         return res
 
 
+class _IdentityEstimator:
+    """Hack to call a scorer when we already have the predictions."""
+
+    def decision_function(self, y_predict):
+        return y_predict
+
+    def predict(self, y_predict):
+        return y_predict
+
+
 class _RidgeGCV(LinearModel):
     """Ridge regression with built-in Generalized Cross-Validation
 
@@ -1087,6 +1097,10 @@ class _RidgeGCV(LinearModel):
 
     looe = y - loov = c / diag(G^-1)
 
+    The best score (negative mean squared error or user-provided scoring) is
+    stored in the `best_score_` attribute, and the selected hyperparameter in
+    `alpha_`.
+
     References
     ----------
     http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf
@@ -1462,43 +1476,41 @@ def fit(self, X, y, sample_weight=None):
         else:
             sqrt_sw = np.ones(X.shape[0], dtype=X.dtype)
 
+        X_mean, *decomposition = decompose(X, y, sqrt_sw)
+
         scorer = check_scoring(self, scoring=self.scoring, allow_none=True)
         error = scorer is None
 
         n_y = 1 if len(y.shape) == 1 else y.shape[1]
-        cv_values = np.zeros((n_samples * n_y, len(self.alphas)),
-                             dtype=X.dtype)
-        C = []
-        X_mean, *decomposition = decompose(X, y, sqrt_sw)
+
+        if self.store_cv_values:
+            self.cv_values_ = np.empty(
+                (n_samples * n_y, len(self.alphas)), dtype=X.dtype)
+
+        best_coef, best_score, best_alpha = None, None, None
+
         for i, alpha in enumerate(self.alphas):
             G_inverse_diag, c = solve(
                 float(alpha), y, sqrt_sw, X_mean, *decomposition)
             if error:
                 squared_errors = (c / G_inverse_diag) ** 2
-                cv_values[:, i] = squared_errors.ravel()
+                alpha_score = -squared_errors.mean()
         or another form of cross-validation, because only generalized 
         or another form of cross-validation, because only generalized 
+                if self.store_cv_values:
+                    self.cv_values_[:, i] = squared_errors.ravel()
             else:
                 predictions = y - (c / G_inverse_diag)
-                cv_values[:, i] = predictions.ravel()
-            C.append(c)
-
-        if error:
-            best = cv_values.mean(axis=0).argmin()
-        else:
-            # The scorer want an object that will make the predictions but
-            # they are already computed efficiently by _RidgeGCV. This
-            # identity_estimator will just return them
-            def identity_estimator():
-                pass
-            identity_estimator.decision_function = lambda y_predict: y_predict
-            identity_estimator.predict = lambda y_predict: y_predict
-
-            # signature of scorer is (estimator, X, y)
-            out = [scorer(identity_estimator, cv_values[:, i], y.ravel())
-                   for i in range(len(self.alphas))]
-            best = np.argmax(out)
-
-        self.alpha_ = self.alphas[best]
-        self.dual_coef_ = C[best]
+                alpha_score = scorer(
+                    _IdentityEstimator(), predictions.ravel(), y.ravel())
+                if self.store_cv_values:
+                    self.cv_values_[:, i] = \
+                        (predictions.ravel() / np.repeat(sqrt_sw, n_y)
+                         + y_offset)
+            if (best_score is None) or (alpha_score > best_score):
+                best_coef, best_score, best_alpha = c, alpha_score, alpha
+
+        self.alpha_ = best_alpha
+        self.best_score_ = best_score
+        self.dual_coef_ = best_coef
         self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)
 
         X_offset += X_mean * X_scale
@@ -1509,7 +1521,7 @@ def identity_estimator():
                 cv_values_shape = n_samples, len(self.alphas)
             else:
                 cv_values_shape = n_samples, n_y, len(self.alphas)
-            self.cv_values_ = cv_values.reshape(cv_values_shape)
+            self.cv_values_ = self.cv_values_.reshape(cv_values_shape)
 
         return self
 

diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py
@@ -553,6 +553,85 @@ def test_ridge_gcv_sample_weights(
     assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
 
 
+@pytest.mark.parametrize("fit_intercept", [True, False])
+def test_ridge_gcv_stored_predictions(fit_intercept):
+    # check the predictions stored in `cv_values_` are equivalent to the
+    # prediction computed with a LOO with a Ridge at a specific alpha when the
+    # `scoring` is not `None`. We check as well that the predictions are
+    # rescaled if fitted with an intercept.
+    # Partly a regression test for #13998
+    X, y = make_regression(n_samples=6, n_features=2, random_state=42)
+    ridge_cv = RidgeCV(
+        fit_intercept=fit_intercept, scoring='neg_mean_squared_error',
+        store_cv_values=True, alphas=[1.]
+    )
+    ridge_cv.fit(X, y)
+    ridge = Ridge(fit_intercept=fit_intercept)
+    loo_pred = cross_val_predict(ridge, X, y, cv=len(X))
+    assert_allclose(loo_pred.ravel(), ridge_cv.cv_values_.ravel())
+
+
+@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize("with_sample_weight", [False, True])
+def test_ridge_gcv_equivalence_prediction_metric(fit_intercept,
+                                                 with_sample_weight):
+    # Check the consistency between the cv_values_ obtained with and without
+    # a score. The default score being the mean squared error, we can compute
+    # the error from the prediction with `scoring='neg_mean_squared_error`
+    # the score reported when `scoring=None`
+    X, y = make_regression(n_samples=6, n_features=2, random_state=42)
+    sample_weight = np.arange(1, len(y) + 1) if with_sample_weight else None
+    from sklearn.utils.validation import _check_sample_weight
+    sample_weight = _check_sample_weight(sample_weight, X)
+    ridge_cv_no_score = RidgeCV(
+        fit_intercept=fit_intercept, scoring=None, store_cv_values=True,
+        alphas=[1.]
+    )
+    ridge_cv_score = RidgeCV(
+        fit_intercept=fit_intercept, scoring='neg_mean_squared_error',
+        store_cv_values=True, alphas=[1.]
+    )
+
+    ridge_cv_no_score.fit(X, y, sample_weight=sample_weight)
+    ridge_cv_score.fit(X, y, sample_weight=sample_weight)
+
+    # compute the mean squared error and apply the sample weight
+    mse = ((y - ridge_cv_score.cv_values_.ravel()) ** 2) * sample_weight
+    mse = mse.sum() / len(mse)
+    assert ridge_cv_no_score.cv_values_.mean() == pytest.approx(mse)
+
+
+def test_ridge_gcv_cv_values_not_stored():
+    # Check that `cv_values_` is not stored when store_cv_values is False
+    X, y = make_regression(n_samples=6, n_features=2, random_state=42)
+    ridge_cv = RidgeCV(store_cv_values=False, alphas=[1.])
+    ridge_cv.fit(X, y)
+
+    assert not hasattr(ridge_cv, "cv_values_")
+
+
+@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize("with_sample_weight", [False, True])
+def test_ridge_gcv_decision_function_scoring(fit_intercept,
+                                             with_sample_weight):
+    # check that the passing a scorer computing the mean squared error is
+    # equivalent to `scoring=None`
+
+    def scorer(estimator, X, Y):
+        predict = estimator.decision_function(X)
+        return - np.mean((Y - predict) ** 2)
+
+    X, y = make_regression(n_samples=10, n_features=2, random_state=0)
+    sample_weight = np.arange(1, len(y) + 1) if with_sample_weight else None
+    ridge_1 = _RidgeGCV(
+        fit_intercept=fit_intercept, scoring=None, store_cv_values=True,
+        alphas=[1.0]).fit(X, y, sample_weight=sample_weight)
+    ridge_2 = _RidgeGCV(
+        fit_intercept=fit_intercept, scoring=scorer, store_cv_values=True,
+        alphas=[1.0]).fit(X, y, sample_weight=sample_weight)
+    assert ridge_1.best_score_ == pytest.approx(ridge_2.best_score_)
+
+
 @pytest.mark.parametrize('mode', [True, 1, 5, 'bad', 'gcv'])
 def test_check_gcv_mode_error(mode):
     X, y = make_regression(n_samples=5, n_features=2)