diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index a370791d248e2..c01b74775684f 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -136,17 +136,24 @@
 Setting the regularization parameter: generalized Cross-Validation
 ------------------------------------------------------------------
 
 :class:`RidgeCV` implements ridge regression with built-in
-cross-validation of the alpha parameter.  The object works in the same way
+cross-validation of the alpha parameter. The object works in the same way
 as GridSearchCV except that it defaults to Generalized Cross-Validation
 (GCV), an efficient form of leave-one-out cross-validation::
 
+    >>> import numpy as np
     >>> from sklearn import linear_model
-    >>> reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3)
-    >>> reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])       # doctest: +SKIP
-    RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3, fit_intercept=True, scoring=None,
-        normalize=False)
-    >>> reg.alpha_                                      # doctest: +SKIP
-    0.1
+    >>> reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
+    >>> reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])  # doctest: +NORMALIZE_WHITESPACE
+    RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
+           1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]),
+        cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
+        scoring=None, store_cv_values=False)
+    >>> reg.alpha_
+    0.01
+
+Specifying the value of the `cv` attribute will trigger the use of
+cross-validation with `GridSearchCV`, for example `cv=10` for 10-fold
+cross-validation, rather than Generalized Cross-Validation.
 
 .. topic:: References
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 473bbf91878d5..38ca0a96040ea 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -38,6 +38,8 @@ random sampling procedures.
   seed, including :class:`linear_model.LogisticRegression`,
   :class:`linear_model.LogisticRegressionCV`, :class:`linear_model.Ridge`,
   and :class:`linear_model.RidgeCV` with 'sag' solver. |Fix|
+- :class:`linear_model.ridge.RidgeCV` when using generalized cross-validation
+  with sparse inputs. |Fix|
 
 Details are listed in the changelog below.
 
@@ -384,7 +386,7 @@ Support for Python 3.4 and below has been officially dropped.
 :mod:`sklearn.linear_model`
 ...........................
 
-- |Enhancement| :mod:`linear_model.ridge` now preserves ``float32`` and
+- |Enhancement| :class:`linear_model.Ridge` now preserves ``float32`` and
   ``float64`` dtypes. :issues:`8769` and :issues:`11000` by
   :user:`Guillaume Lemaitre `, and :user:`Joan Massich `
 
@@ -482,6 +484,10 @@ Support for Python 3.4 and below has been officially dropped.
   in version 0.21 and will be removed in version 0.23. :pr:`12821` by
   :user:`Nicolas Hug `.
 
+- |Fix| :class:`linear_model.ridge.RidgeCV` with generalized cross-validation
+  now correctly fits an intercept when ``fit_intercept=True`` and the design
+  matrix is sparse. :issue:`13350` by :user:`Jérôme Dockès `
+
 :mod:`sklearn.manifold`
 .......................
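
Editorial aside (not part of the patch): the doc text above describes GCV as "an efficient form of leave-one-out cross-validation". The identity it relies on can be checked with stock scikit-learn and NumPy: for a fixed alpha, the leave-one-out residual of sample i equals (y_i - y_hat_i) / (1 - h_ii), where h_ii is the i-th diagonal entry of the ridge hat matrix, so nothing has to be refit n times. The sketch below uses fit_intercept=False to keep the algebra minimal; handling the intercept (and sample weights) without densifying X is exactly what the ridge.py changes that follow are about.

    # Illustrative sketch only -- not part of this patch.
    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X, y, alpha = rng.randn(20, 5), rng.randn(20), 1.0

    # Hat matrix of ridge regression without intercept.
    H = X @ np.linalg.inv(X.T @ X + alpha * np.eye(X.shape[1])) @ X.T
    y_hat = Ridge(alpha=alpha, fit_intercept=False).fit(X, y).predict(X)
    loo_fast = (y - y_hat) / (1 - np.diag(H))  # closed-form LOO residuals

    # Brute-force leave-one-out for comparison.
    loo_slow = np.empty_like(y)
    for i in range(len(y)):
        mask = np.arange(len(y)) != i
        model = Ridge(alpha=alpha, fit_intercept=False).fit(X[mask], y[mask])
        loo_slow[i] = y[i] - model.predict(X[i:i + 1])[0]

    assert np.allclose(loo_fast, loo_slow)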
diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py
index b36535ffaeff1..0e54126e52c33 100644
--- a/sklearn/linear_model/ridge.py
+++ b/sklearn/linear_model/ridge.py
@@ -31,6 +31,7 @@
 from ..model_selection import GridSearchCV
 from ..metrics.scorer import check_scoring
 from ..exceptions import ConvergenceWarning
+from ..utils.sparsefuncs import mean_variance_axis
 
 
 def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0,
@@ -927,6 +928,106 @@ def classes_(self):
         return self._label_binarizer.classes_
 
 
+def _check_gcv_mode(X, gcv_mode):
+    possible_gcv_modes = [None, 'auto', 'svd', 'eigen']
+    if gcv_mode not in possible_gcv_modes:
+        raise ValueError(
+            "Unknown value for 'gcv_mode'. "
+            "Got {} instead of one of {}".format(
+                gcv_mode, possible_gcv_modes))
+    if gcv_mode in ['eigen', 'svd']:
+        return gcv_mode
+    # if X has more rows than columns, use decomposition of X^T.X,
+    # otherwise X.X^T
+    if X.shape[0] > X.shape[1]:
+        return 'svd'
+    return 'eigen'
+
+
+def _find_smallest_angle(query, vectors):
+    """Find the column of vectors that is most aligned with the query.
+
+    Both query and the columns of vectors must have their l2 norm equal to 1.
+
+    Parameters
+    ----------
+    query : ndarray, shape (n_samples,)
+        Normalized query vector.
+
+    vectors : ndarray, shape (n_samples, n_features)
+        Vectors to which we compare query, as columns. Must be normalized.
+    """
+    abs_cosine = np.abs(query.dot(vectors))
+    index = np.argmax(abs_cosine)
+    return index
+
+
+class _X_operator(sparse.linalg.LinearOperator):
+    """Behaves as centered and scaled X with an added intercept column.
+
+    This operator behaves as
+    np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]])
+    """
+
+    def __init__(self, X, X_mean, sqrt_sw):
+        n_samples, n_features = X.shape
+        super().__init__(X.dtype, (n_samples, n_features + 1))
+        self.X = X
+        self.X_mean = X_mean
+        self.sqrt_sw = sqrt_sw
+
+    def _matvec(self, v):
+        v = v.ravel()
+        return safe_sparse_dot(
+            self.X, v[:-1], dense_output=True
+        ) - self.sqrt_sw * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw
+
+    def _matmat(self, v):
+        return (
+            safe_sparse_dot(self.X, v[:-1], dense_output=True) -
+            self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) + v[-1] *
+            self.sqrt_sw[:, None])
+
+    def _transpose(self):
+        return _Xt_operator(self.X, self.X_mean, self.sqrt_sw)
+
+
+class _Xt_operator(sparse.linalg.LinearOperator):
+    """Behaves as transposed centered and scaled X with an intercept column.
+
+    This operator behaves as
+    np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T
+    """
+
+    def __init__(self, X, X_mean, sqrt_sw):
+        n_samples, n_features = X.shape
+        super().__init__(X.dtype, (n_features + 1, n_samples))
+        self.X = X
+        self.X_mean = X_mean
+        self.sqrt_sw = sqrt_sw
+
+    def _matvec(self, v):
+        v = v.ravel()
+        n_features = self.shape[0]
+        res = np.empty(n_features, dtype=self.X.dtype)
+        res[:-1] = (
+            safe_sparse_dot(self.X.T, v, dense_output=True) -
+            (self.X_mean * self.sqrt_sw.dot(v))
+        )
+        res[-1] = np.dot(v, self.sqrt_sw)
+        return res
+
+    def _matmat(self, v):
+        n_features = self.shape[0]
+        res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype)
+        res[:-1] = (
+            safe_sparse_dot(self.X.T, v, dense_output=True) -
+            self.X_mean[:, None] * self.sqrt_sw.dot(v)
+        )
+        res[-1] = np.dot(self.sqrt_sw, v)
+        return res
+
+
 class _RidgeGCV(LinearModel):
     """Ridge regression with built-in Generalized Cross-Validation
 
@@ -978,18 +1079,6 @@ def __init__(self, alphas=(0.1, 1.0, 10.0),
         self.gcv_mode = gcv_mode
         self.store_cv_values = store_cv_values
 
-    def _pre_compute(self, X, y, centered_kernel=True):
-        # even if X is very sparse, K is usually very dense
-        K = safe_sparse_dot(X, X.T, dense_output=True)
-        # the following emulates an additional constant regressor
-        # corresponding to fit_intercept=True
-        # but this is done only when the features have been centered
-        if centered_kernel:
-            K += np.ones_like(K)
-        v, Q = linalg.eigh(K)
-        QT_y = np.dot(Q.T, y)
-        return v, Q, QT_y
-
     def _decomp_diag(self, v_prime, Q):
         # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T))
         return (v_prime * Q ** 2).sum(axis=-1)
@@ -1001,18 +1090,161 @@ def _diag_dot(self, D, B):
         D = D[(slice(None), ) + (np.newaxis, ) * (len(B.shape) - 1)]
         return D * B
 
-    def _errors_and_values_helper(self, alpha, y, v, Q, QT_y):
-        """Helper function to avoid code duplication between self._errors and
-        self._values.
+    def _compute_gram(self, X, sqrt_sw):
+        """Computes the Gram matrix X.X^T with possible centering.
 
-        Notes
-        -----
-        We don't construct matrix G, instead compute action on y & diagonal.
+        When ``fit_intercept`` is True and ``X`` is sparse, compute
+        (X - X.mean(axis=0)).dot((X - X.mean(axis=0)).T); otherwise compute
+        X.dot(X.T), since a dense X has already been centered in
+        preprocessing (or no intercept is being fit).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            The input uncentered data.
+
+        sqrt_sw : ndarray, shape (n_samples,)
+            Square roots of the sample weights.
+
+        Returns
+        -------
+        gram : ndarray, shape (n_samples, n_samples)
+            The Gram matrix.
+        X_mean : ndarray, shape (n_features,)
+            The weighted mean of ``X`` for each feature.
+        """
+        center = self.fit_intercept and sparse.issparse(X)
+        if not center:
+            # in this case centering has been done in preprocessing
+            # or we are not fitting an intercept.
+            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
+            return safe_sparse_dot(X, X.T, dense_output=True), X_mean
+        # otherwise X is always sparse
+        n_samples = X.shape[0]
+        sample_weight_matrix = sparse.dia_matrix(
+            (sqrt_sw, 0), shape=(n_samples, n_samples))
+        X_weighted = sample_weight_matrix.dot(X)
+        X_mean, _ = mean_variance_axis(X_weighted, axis=0)
+        X_mean *= n_samples / sqrt_sw.dot(sqrt_sw)
+        X_mX = sqrt_sw[:, None] * safe_sparse_dot(
+            X_mean, X.T, dense_output=True)
+        X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean)
+        return (safe_sparse_dot(X, X.T, dense_output=True) + X_mX_m
+                - X_mX - X_mX.T, X_mean)
+
+    def _compute_covariance(self, X, sqrt_sw):
+        """Computes the covariance matrix X^T.X with possible centering.
+
+        When ``fit_intercept`` is True, compute
+        (X - X.mean(axis=0)).T.dot(X - X.mean(axis=0));
+        otherwise compute X.T.dot(X).
+
+        Parameters
+        ----------
+        X : sparse matrix, shape (n_samples, n_features)
+            The input uncentered data.
+
+        sqrt_sw : ndarray, shape (n_samples,)
+            Square roots of the sample weights.
+
+        Returns
+        -------
+        covariance : ndarray, shape (n_features, n_features)
+            The covariance matrix.
+        X_mean : ndarray, shape (n_features,)
+            The weighted mean of ``X`` for each feature.
+        """
+        if not self.fit_intercept:
+            # in this case no centering is needed.
+            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
+            return safe_sparse_dot(X.T, X, dense_output=True), X_mean
+        # this function only gets called for sparse X
+        n_samples = X.shape[0]
+        sample_weight_matrix = sparse.dia_matrix(
+            (sqrt_sw, 0), shape=(n_samples, n_samples))
+        X_weighted = sample_weight_matrix.dot(X)
+        X_mean, _ = mean_variance_axis(X_weighted, axis=0)
+        X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw)
+        weight_sum = sqrt_sw.dot(sqrt_sw)
+        return (safe_sparse_dot(X.T, X, dense_output=True) -
+                weight_sum * np.outer(X_mean, X_mean),
+                X_mean)
+
+    def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw):
+        """Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T)
+        without explicitly centering X or computing X.dot(A)
+        when X is sparse.
+
+        Parameters
+        ----------
+        X : sparse matrix, shape = (n_samples, n_features)
+
+        A : np.ndarray, shape = (n_features, n_features)
+
+        X_mean : np.ndarray, shape = (n_features,)
+
+        sqrt_sw : np.ndarray, shape = (n_samples,)
+            Square roots of the sample weights.
+
+        Returns
+        -------
+        diag : np.ndarray, shape = (n_samples,)
+            The computed diagonal.
+        """
+        intercept_col = sqrt_sw
+        scale = sqrt_sw
+        batch_size = X.shape[1]
+        diag = np.empty(X.shape[0], dtype=X.dtype)
+        for start in range(0, X.shape[0], batch_size):
+            batch = slice(start, min(X.shape[0], start + batch_size), 1)
+            X_batch = np.empty(
+                (X[batch].shape[0], X.shape[1] + self.fit_intercept),
+                dtype=X.dtype
+            )
+            if self.fit_intercept:
+                X_batch[:, :-1] = X[batch].A - X_mean * scale[batch][:, None]
+                X_batch[:, -1] = intercept_col[batch]
+            else:
+                X_batch = X[batch].A
+            diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1)
+        return diag
+
+    def _eigen_decompose_gram(self, X, y, sqrt_sw):
+        """Eigendecomposition of X.X^T, used when n_samples <= n_features."""
+        # if X is dense it has already been centered in preprocessing
+        K, X_mean = self._compute_gram(X, sqrt_sw)
+        if self.fit_intercept:
+            # to emulate centering X with sample weights,
+            # i.e. removing the weighted average, we add a column
+            # containing the square roots of the sample weights.
+            # by centering, it is orthogonal to the other columns
+            K += np.outer(sqrt_sw, sqrt_sw)
+        v, Q = linalg.eigh(K)
+        QT_y = np.dot(Q.T, y)
+        return X_mean, v, Q, QT_y
+
+    def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, v, Q, QT_y):
+        """Compute dual coefficients and diagonal of (Identity - Hat_matrix)
+
+        Used when we have a decomposition of X.X^T (n_features >= n_samples).
         """
         w = 1. / (v + alpha)
-        constant_column = np.var(Q, 0) < 1.e-12
-        # detect constant columns
-        w[constant_column] = 0  # cancel the regularization for the intercept
+        if self.fit_intercept:
+            # the vector containing the square roots of the sample weights (1
+            # when no sample weights) is the eigenvector of XX^T which
+            # corresponds to the intercept; we cancel the regularization on
+            # this dimension. the corresponding eigenvalue is
+            # sum(sample_weight).
+            normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw)
+            intercept_dim = _find_smallest_angle(normalized_sw, Q)
+            w[intercept_dim] = 0  # cancel regularization for the intercept
 
         c = np.dot(Q, self._diag_dot(w, QT_y))
         G_diag = self._decomp_diag(w, Q)
@@ -1021,35 +1253,117 @@ def _errors_and_values_helper(self, alpha, y, v, Q, QT_y):
             G_diag = G_diag[:, np.newaxis]
         return G_diag, c
 
-    def _errors(self, alpha, y, v, Q, QT_y):
-        G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y)
-        return (c / G_diag) ** 2, c
+    def _eigen_decompose_covariance(self, X, y, sqrt_sw):
+        """Eigendecomposition of X^T.X, used when n_samples > n_features."""
+        n_samples, n_features = X.shape
+        cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype)
+        cov[:-1, :-1], X_mean = self._compute_covariance(X, sqrt_sw)
+        if not self.fit_intercept:
+            cov = cov[:-1, :-1]
+        # to emulate centering X with sample weights,
+        # i.e. removing the weighted average, we add a column
+        # containing the square roots of the sample weights.
+        # by centering, it is orthogonal to the other columns
+        # when all samples have the same weight we add a column of 1
+        else:
+            cov[-1] = 0
+            cov[:, -1] = 0
+            cov[-1, -1] = sqrt_sw.dot(sqrt_sw)
+        nullspace_dim = max(0, X.shape[1] - X.shape[0])
+        s, V = linalg.eigh(cov)
+        # remove eigenvalues and vectors in the null space of X^T.X
+        s = s[nullspace_dim:]
+        V = V[:, nullspace_dim:]
+        return X_mean, s, V, X
+
+    def _solve_eigen_covariance_no_intercept(
+            self, alpha, y, sqrt_sw, X_mean, s, V, X):
+        """Compute dual coefficients and diagonal of (Identity - Hat_matrix)
+
+        Used when we have a decomposition of X^T.X
+        (n_features < n_samples and X is sparse), and not fitting an intercept.
+ """ + w = 1 / (s + alpha) + A = (V * w).dot(V.T) + AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True)) + y_hat = safe_sparse_dot(X, AXy, dense_output=True) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha - def _values(self, alpha, y, v, Q, QT_y): - G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y) - return y - (c / G_diag), c + def _solve_eigen_covariance_intercept( + self, alpha, y, sqrt_sw, X_mean, s, V, X): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) - def _pre_compute_svd(self, X, y, centered_kernel=True): - if sparse.issparse(X): - raise TypeError("SVD not supported for sparse matrices") - if centered_kernel: - X = np.hstack((X, np.ones((X.shape[0], 1)))) - # to emulate fit_intercept=True situation, add a column on ones - # Note that by centering, the other columns are orthogonal to that one + Used when we have a decomposition of X^T.X + (n_features < n_samples and X is sparse), + and we are fitting an intercept. + """ + # the vector [0, 0, ..., 0, 1] + # is the eigenvector of X^TX which + # corresponds to the intercept; we cancel the regularization on + # this dimension. the corresponding eigenvalue is + # sum(sample_weight), e.g. n when uniform sample weights. + intercept_sv = np.zeros(V.shape[0]) + intercept_sv[-1] = 1 + intercept_dim = _find_smallest_angle(intercept_sv, V) + w = 1 / (s + alpha) + w[intercept_dim] = 1 / s[intercept_dim] + A = (V * w).dot(V.T) + # add a column to X containing the square roots of sample weights + X_op = _X_operator(X, X_mean, sqrt_sw) + AXy = A.dot(X_op.T.dot(y)) + y_hat = X_op.dot(AXy) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + # return (1 - hat_diag), (y - y_hat) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha + + def _solve_eigen_covariance( + self, alpha, y, sqrt_sw, X_mean, s, V, X): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have a decomposition of X^T.X + (n_features < n_samples and X is sparse). + """ + if self.fit_intercept: + return self._solve_eigen_covariance_intercept( + alpha, y, sqrt_sw, X_mean, s, V, X) + return self._solve_eigen_covariance_no_intercept( + alpha, y, sqrt_sw, X_mean, s, V, X) + + def _svd_decompose_design_matrix(self, X, y, sqrt_sw): + # X already centered + X_mean = np.zeros(X.shape[1], dtype=X.dtype) + if self.fit_intercept: + # to emulate fit_intercept=True situation, add a column + # containing the square roots of the sample weights + # by centering, the other columns are orthogonal to that one + intercept_column = sqrt_sw[:, None] + X = np.hstack((X, intercept_column)) U, s, _ = linalg.svd(X, full_matrices=0) v = s ** 2 UT_y = np.dot(U.T, y) - return v, U, UT_y + return X_mean, v, U, UT_y - def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y): - """Helper function to avoid code duplication between self._errors_svd - and self._values_svd. + def _solve_svd_design_matrix( + self, alpha, y, sqrt_sw, X_mean, v, U, UT_y): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have an SVD decomposition of X + (n_features >= n_samples and X is dense). 
""" - constant_column = np.var(U, 0) < 1.e-12 - # detect columns colinear to ones w = ((v + alpha) ** -1) - (alpha ** -1) - w[constant_column] = - (alpha ** -1) - # cancel the regularization for the intercept + if self.fit_intercept: + # detect intercept column + normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) + intercept_dim = _find_smallest_angle(normalized_sw, U) + # cancel the regularization for the intercept + w[intercept_dim] = - (alpha ** -1) c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha ** -1) * y G_diag = self._decomp_diag(w, U) + (alpha ** -1) if len(y.shape) != 1: @@ -1057,24 +1371,16 @@ def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y): G_diag = G_diag[:, np.newaxis] return G_diag, c - def _errors_svd(self, alpha, y, v, U, UT_y): - G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y) - return (c / G_diag) ** 2, c - - def _values_svd(self, alpha, y, v, U, UT_y): - G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y) - return y - (c / G_diag), c - def fit(self, X, y, sample_weight=None): """Fit Ridge regression model Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] - Training data + Training data. Will be cast to float64 if necessary y : array-like, shape = [n_samples] or [n_samples, n_targets] - Target values. Will be cast to X's dtype if necessary + Target values. Will be cast to float64 if necessary sample_weight : float or array-like of shape [n_samples] Sample weight @@ -1083,10 +1389,15 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, - accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float64, np.float32], + X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], + dtype=[np.float64], multi_output=True, y_numeric=True) + + if np.any(self.alphas <= 0): + raise ValueError( + "alphas must be positive. 
Got {} containing some " + "negative or null value instead.".format(self.alphas)) + if sample_weight is not None and not isinstance(sample_weight, float): sample_weight = check_array(sample_weight, ensure_2d=False, dtype=X.dtype) @@ -1096,56 +1407,42 @@ def fit(self, X, y, sample_weight=None): X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) - gcv_mode = self.gcv_mode - with_sw = len(np.shape(sample_weight)) - - if gcv_mode is None or gcv_mode == 'auto': - if sparse.issparse(X) or n_features > n_samples or with_sw: - gcv_mode = 'eigen' - else: - gcv_mode = 'svd' - elif gcv_mode == "svd" and with_sw: - # FIXME non-uniform sample weights not yet supported - warnings.warn("non-uniform sample weights unsupported for svd, " - "forcing usage of eigen") - gcv_mode = 'eigen' + gcv_mode = _check_gcv_mode(X, self.gcv_mode) if gcv_mode == 'eigen': - _pre_compute = self._pre_compute - _errors = self._errors - _values = self._values + decompose = self._eigen_decompose_gram + solve = self._solve_eigen_gram elif gcv_mode == 'svd': - # assert n_samples >= n_features - _pre_compute = self._pre_compute_svd - _errors = self._errors_svd - _values = self._values_svd - else: - raise ValueError('bad gcv_mode "%s"' % gcv_mode) + if sparse.issparse(X): + decompose = self._eigen_decompose_covariance + solve = self._solve_eigen_covariance + else: + decompose = self._svd_decompose_design_matrix + solve = self._solve_svd_design_matrix if sample_weight is not None: X, y = _rescale_data(X, y, sample_weight) - - centered_kernel = not sparse.issparse(X) and self.fit_intercept - - v, Q, QT_y = _pre_compute(X, y, centered_kernel) - n_y = 1 if len(y.shape) == 1 else y.shape[1] - cv_values = np.zeros((n_samples * n_y, len(self.alphas))) - C = [] + sqrt_sw = np.sqrt(sample_weight) + else: + sqrt_sw = np.ones(X.shape[0], dtype=X.dtype) scorer = check_scoring(self, scoring=self.scoring, allow_none=True) error = scorer is None - if np.any(self.alphas < 0): - raise ValueError("alphas cannot be negative. " - "Got {} containing some " - "negative value instead.".format(self.alphas)) - + n_y = 1 if len(y.shape) == 1 else y.shape[1] + cv_values = np.zeros((n_samples * n_y, len(self.alphas)), + dtype=X.dtype) + C = [] + X_mean, *decomposition = decompose(X, y, sqrt_sw) for i, alpha in enumerate(self.alphas): + G_diag, c = solve( + float(alpha), y, sqrt_sw, X_mean, *decomposition) if error: - out, c = _errors(float(alpha), y, v, Q, QT_y) + squared_errors = (c / G_diag) ** 2 + cv_values[:, i] = squared_errors.ravel() else: - out, c = _values(float(alpha), y, v, Q, QT_y) - cv_values[:, i] = out.ravel() + predictions = y - (c / G_diag) + cv_values[:, i] = predictions.ravel() C.append(c) if error: @@ -1167,6 +1464,7 @@ def identity_estimator(): self.dual_coef_ = C[best] self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) + X_offset += X_mean * X_scale self._set_intercept(X_offset, y_offset, X_scale) if self.store_cv_values: @@ -1198,7 +1496,8 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- X : array-like, shape = [n_samples, n_features] - Training data + Training data. If using GCV, will be cast to float64 + if necessary. y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values. 
@@ -1209,8 +1508,17 @@ def fit(self, X, y, sample_weight=None):
         Returns
         -------
         self : object
+
+        Notes
+        -----
+        When sample_weight is provided, the selected hyperparameter may depend
+        on whether we use generalized cross-validation (cv=None or cv='auto')
+        or another form of cross-validation, because only generalized
+        cross-validation takes the sample weights into account when computing
+        the validation score.
         """
-        if self.cv is None:
+        cv = self.cv
+        if cv is None:
             estimator = _RidgeGCV(self.alphas,
                                   fit_intercept=self.fit_intercept,
                                   normalize=self.normalize,
@@ -1226,9 +1534,11 @@ def fit(self, X, y, sample_weight=None):
                 raise ValueError("cv!=None and store_cv_values=True "
                                  " are incompatible")
             parameters = {'alpha': self.alphas}
+            solver = 'sparse_cg' if sparse.issparse(X) else 'auto'
             gs = GridSearchCV(Ridge(fit_intercept=self.fit_intercept,
-                                    normalize=self.normalize),
-                              parameters, cv=self.cv, scoring=self.scoring)
+                                    normalize=self.normalize,
+                                    solver=solver),
+                              parameters, cv=cv, scoring=self.scoring)
             gs.fit(X, y, sample_weight=sample_weight)
             estimator = gs.best_estimator_
             self.alpha_ = gs.best_estimator_.alpha
@@ -1258,6 +1568,7 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin):
         the estimates. Larger values specify stronger regularization.
         Alpha corresponds to ``C^-1`` in other linear models such as
         LogisticRegression or LinearSVC.
+        If using generalized cross-validation, alphas must be positive.
 
     fit_intercept : boolean
         Whether to calculate the intercept for this model. If set
@@ -1276,12 +1587,15 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin):
         A string (see model evaluation documentation) or
         a scorer callable object / function
         with signature ``scorer(estimator, X, y)``.
+        If None, the negative mean squared error is used when cv is 'auto'
+        or None (i.e. generalized cross-validation), and r2 score otherwise.
 
     cv : int, cross-validation generator or an iterable, optional
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
 
        - None, to use the efficient Leave-One-Out cross-validation
+          (also known as Generalized Cross-Validation).
        - integer, to specify the number of folds.
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.
@@ -1297,15 +1611,13 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin):
        Flag indicating which strategy to use when performing
        Generalized Cross-Validation. Options are::
 
-        'auto' : use svd if n_samples > n_features or when X is a sparse
-            matrix, otherwise use eigen
-        'svd' : force computation via singular value decomposition of X
-            (does not work for sparse matrices)
-        'eigen' : force computation via eigendecomposition of X^T X
+        'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen'
+        'svd' : force use of singular value decomposition of X when X is
+            dense, eigenvalue decomposition of X^T.X when X is sparse.
+        'eigen' : force computation via eigendecomposition of X.X^T
 
        The 'auto' mode is the default and is intended to pick the cheaper
-       option of the two depending upon the shape and format of the training
-       data.
+       option of the two depending on the shape of the training data.
 
    store_cv_values : boolean, default=False
        Flag indicating if the cross-validation values corresponding to
 
@@ -1472,7 +1784,8 @@ def fit(self, X, y, sample_weight=None):
         ----------
         X : array-like, shape (n_samples, n_features)
             Training vectors, where n_samples is the number of samples
-            and n_features is the number of features.
+            and n_features is the number of features. When using GCV,
+            will be cast to float64 if necessary.
 
         y : array-like, shape (n_samples,)
             Target values. Will be cast to X's dtype if necessary
diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py
index a9c2f98a48b90..1cd386ee5d618 100644
--- a/sklearn/linear_model/tests/test_ridge.py
+++ b/sklearn/linear_model/tests/test_ridge.py
@@ -9,7 +9,6 @@
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_array_almost_equal
-from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_greater
@@ -35,10 +34,12 @@
 from sklearn.linear_model.ridge import RidgeClassifierCV
 from sklearn.linear_model.ridge import _solve_cholesky
 from sklearn.linear_model.ridge import _solve_cholesky_kernel
+from sklearn.linear_model.ridge import _check_gcv_mode
+from sklearn.linear_model.ridge import _X_operator
 
 from sklearn.datasets import make_regression
 from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import KFold
+from sklearn.model_selection import KFold, GroupKFold, cross_val_predict
 
 from sklearn.utils import check_random_state, _IS_32BIT
 from sklearn.datasets import make_multilabel_classification
@@ -313,6 +314,213 @@ def test_ridge_individual_penalties():
     assert_raises(ValueError, ridge.fit, X, y)
 
 
+@pytest.mark.parametrize('n_col', [(), (1,), (3,)])
+def test_x_operator(n_col):
+    rng = np.random.RandomState(0)
+    X = rng.randn(11, 8)
+    X_m = rng.randn(8)
+    sqrt_sw = rng.randn(len(X))
+    Y = rng.randn(11, *n_col)
+    A = rng.randn(9, *n_col)
+    operator = _X_operator(sp.csr_matrix(X), X_m, sqrt_sw)
+    reference_operator = np.hstack(
+        [X - sqrt_sw[:, None] * X_m, sqrt_sw[:, None]])
+    assert_allclose(reference_operator.dot(A), operator.dot(A))
+    assert_allclose(reference_operator.T.dot(Y), operator.T.dot(Y))
+
+
+@pytest.mark.parametrize('shape', [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)])
+@pytest.mark.parametrize('uniform_weights', [True, False])
+def test_compute_gram(shape, uniform_weights):
+    rng = np.random.RandomState(0)
+    X = rng.randn(*shape)
+    if uniform_weights:
+        sw = np.ones(X.shape[0])
+    else:
+        sw = rng.chisquare(1, shape[0])
+    sqrt_sw = np.sqrt(sw)
+    X_mean = np.average(X, axis=0, weights=sw)
+    X_centered = (X - X_mean) * sqrt_sw[:, None]
+    true_gram = X_centered.dot(X_centered.T)
+    X_sparse = sp.csr_matrix(X * sqrt_sw[:, None])
+    gcv = _RidgeGCV(fit_intercept=True)
+    computed_gram, computed_mean = gcv._compute_gram(X_sparse, sqrt_sw)
+    assert_allclose(X_mean, computed_mean)
+    assert_allclose(true_gram, computed_gram)
+
+
+@pytest.mark.parametrize('shape', [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)])
+@pytest.mark.parametrize('uniform_weights', [True, False])
+def test_compute_covariance(shape, uniform_weights):
+    rng = np.random.RandomState(0)
+    X = rng.randn(*shape)
+    if uniform_weights:
+        sw = np.ones(X.shape[0])
+    else:
+        sw = rng.chisquare(1, shape[0])
+    sqrt_sw = np.sqrt(sw)
+    X_mean = np.average(X, axis=0, weights=sw)
+    X_centered = (X - X_mean) * sqrt_sw[:, None]
+    true_covariance = X_centered.T.dot(X_centered)
+    X_sparse = sp.csr_matrix(X * sqrt_sw[:, None])
+    gcv = _RidgeGCV(fit_intercept=True)
+    computed_cov, computed_mean = gcv._compute_covariance(X_sparse, sqrt_sw)
+    assert_allclose(X_mean, computed_mean)
+    assert_allclose(true_covariance, computed_cov)
+
+
+def _make_sparse_offset_regression(
+        n_samples=100, n_features=100, proportion_nonzero=.5,
+        n_informative=10, n_targets=1, bias=13., X_offset=30.,
+        noise=30., shuffle=True, coef=False, random_state=None):
+    X, y, c = make_regression(
+        n_samples=n_samples, n_features=n_features,
+        n_informative=n_informative, n_targets=n_targets, bias=bias,
+        noise=noise, shuffle=shuffle,
+        coef=True, random_state=random_state)
+    if n_features == 1:
+        c = np.asarray([c])
+    X += X_offset
+    mask = np.random.RandomState(random_state).binomial(
+        1, proportion_nonzero, X.shape) > 0
+    removed_X = X.copy()
+    X[~mask] = 0.
+    removed_X[mask] = 0.
+    y -= removed_X.dot(c)
+    if n_features == 1:
+        c = c[0]
+    if coef:
+        return X, y, c
+    return X, y
+
+
+@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen'])
+@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix])
+@pytest.mark.parametrize('X_shape', [(11, 8), (11, 20)])
+@pytest.mark.parametrize('fit_intercept', [True, False])
+@pytest.mark.parametrize(
+    'y_shape, normalize, noise',
+    [
+        ((11,), True, 1.),
+        ((11, 1), False, 30.),
+        ((11, 3), False, 150.),
+    ]
+)
+def test_ridge_gcv_vs_ridge_loo_cv(
+        gcv_mode, X_constructor, X_shape, y_shape,
+        fit_intercept, normalize, noise):
+    n_samples, n_features = X_shape
+    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
+    X, y = _make_sparse_offset_regression(
+        n_samples=n_samples, n_features=n_features, n_targets=n_targets,
+        random_state=0, shuffle=False, noise=noise, n_informative=5
+    )
+    y = y.reshape(y_shape)
+
+    alphas = [1e-3, .1, 1., 10., 1e3]
+    loo_ridge = RidgeCV(cv=n_samples, fit_intercept=fit_intercept,
+                        alphas=alphas, scoring='neg_mean_squared_error',
+                        normalize=normalize)
+    gcv_ridge = RidgeCV(gcv_mode=gcv_mode, fit_intercept=fit_intercept,
+                        alphas=alphas, normalize=normalize)
+
+    loo_ridge.fit(X, y)
+
+    X_gcv = X_constructor(X)
+    gcv_ridge.fit(X_gcv, y)
+
+    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
+    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
+    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
+
+
+@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen'])
+@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix])
+@pytest.mark.parametrize('n_features', [8, 20])
+@pytest.mark.parametrize('y_shape, fit_intercept, noise',
+                         [((11,), True, 1.),
+                          ((11, 1), True, 20.),
+                          ((11, 3), True, 150.),
+                          ((11, 3), False, 30.)])
+def test_ridge_gcv_sample_weights(
+        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
+    alphas = [1e-3, .1, 1., 10., 1e3]
+    rng = np.random.RandomState(0)
+    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
+    X, y = _make_sparse_offset_regression(
+        n_samples=11, n_features=n_features, n_targets=n_targets,
+        random_state=0, shuffle=False, noise=noise)
+    y = y.reshape(y_shape)
+
+    sample_weight = 3 * rng.randn(len(X))
+    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
+    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
+    sample_weight = sample_weight.astype(float)
+    X_tiled, y_tiled = X[indices], y[indices]
+
+    cv = GroupKFold(n_splits=X.shape[0])
+    splits = cv.split(X_tiled, y_tiled, groups=indices)
+    kfold = RidgeCV(
+        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
+        fit_intercept=fit_intercept)
+    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
+    # `iid` parameter will change from True to False in version 0.22 and will
+    # be removed in 0.24
+    with ignore_warnings(category=DeprecationWarning):
+        kfold.fit(X_tiled, y_tiled)
+
+    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
+    splits = cv.split(X_tiled, y_tiled, groups=indices)
+    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
+    kfold_errors = (y_tiled - predictions)**2
+    kfold_errors = [
+        np.sum(kfold_errors[indices == i], axis=0) for
+        i in np.arange(X.shape[0])]
+    kfold_errors = np.asarray(kfold_errors)
+
+    X_gcv = X_constructor(X)
+    gcv_ridge = RidgeCV(
+        alphas=alphas, store_cv_values=True,
+        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
+    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
+    if len(y_shape) == 2:
+        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
+    else:
+        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]
+
+    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
+    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
+    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
+    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
+
+
+@pytest.mark.parametrize('mode', [True, 1, 5, 'bad', 'gcv'])
+def test_check_gcv_mode_error(mode):
+    X, y = make_regression(n_samples=5, n_features=2)
+    gcv = RidgeCV(gcv_mode=mode)
+    with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"):
+        gcv.fit(X, y)
+    with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"):
+        _check_gcv_mode(X, mode)
+
+
+@pytest.mark.parametrize("sparse", [True, False])
+@pytest.mark.parametrize(
+    'mode, mode_n_greater_than_p, mode_p_greater_than_n',
+    [(None, 'svd', 'eigen'),
+     ('auto', 'svd', 'eigen'),
+     ('eigen', 'eigen', 'eigen'),
+     ('svd', 'svd', 'svd')]
+)
+def test_check_gcv_mode_choice(sparse, mode, mode_n_greater_than_p,
+                               mode_p_greater_than_n):
+    X, _ = make_regression(n_samples=5, n_features=2)
+    if sparse:
+        X = sp.csr_matrix(X)
+    assert _check_gcv_mode(X, mode) == mode_n_greater_than_p
+    assert _check_gcv_mode(X.T, mode) == mode_p_greater_than_n
+
+
 def _test_ridge_loo(filter_):
     # test that can work with both dense or sparse matrices
     n_samples = X_diabetes.shape[0]
@@ -320,46 +528,7 @@ def _test_ridge_loo(filter_):
     ret = []
 
     fit_intercept = filter_ == DENSE_FILTER
-    if fit_intercept:
-        X_diabetes_ = X_diabetes - X_diabetes.mean(0)
-    else:
-        X_diabetes_ = X_diabetes
     ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)
-    ridge = Ridge(alpha=1.0, fit_intercept=fit_intercept)
-
-    # because fit_intercept is applied
-
-    # generalized cross-validation (efficient leave-one-out)
-    decomp = ridge_gcv._pre_compute(X_diabetes_, y_diabetes, fit_intercept)
-    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
-    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)
-
-    # brute-force leave-one-out: remove one example at a time
-    errors2 = []
-    values2 = []
-    for i in range(n_samples):
-        sel = np.arange(n_samples) != i
-        X_new = X_diabetes_[sel]
-        y_new = y_diabetes[sel]
-        ridge.fit(X_new, y_new)
-        value = ridge.predict([X_diabetes_[i]])[0]
-        error = (y_diabetes[i] - value) ** 2
-        errors2.append(error)
-        values2.append(value)
-
-    # check that efficient and brute-force LOO give same results
-    assert_almost_equal(errors, errors2)
-    assert_almost_equal(values, values2)
-
-    # generalized cross-validation (efficient leave-one-out,
-    # SVD variation)
-    decomp = ridge_gcv._pre_compute_svd(X_diabetes_, y_diabetes, fit_intercept)
-    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
-    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)
-
-    # check that efficient and SVD efficient LOO give same results
-    assert_almost_equal(errors, errors3)
-    assert_almost_equal(values, values3)
 
     # check best alpha
     ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
@@ -371,25 +540,26 @@ def _test_ridge_loo(filter_):
     scoring = make_scorer(mean_squared_error, greater_is_better=False)
     ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
     f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
-    assert_equal(ridge_gcv2.alpha_, alpha_)
+    assert ridge_gcv2.alpha_ == pytest.approx(alpha_)
 
     # check that we get same best alpha with custom score_func
     func = lambda x, y: -mean_squared_error(x, y)
     scoring = make_scorer(func)
     ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
     f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
-    assert_equal(ridge_gcv3.alpha_, alpha_)
+    assert ridge_gcv3.alpha_ == pytest.approx(alpha_)
 
     # check that we get same best alpha with a scorer
     scorer = get_scorer('neg_mean_squared_error')
     ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
     ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
-    assert_equal(ridge_gcv4.alpha_, alpha_)
+    assert ridge_gcv4.alpha_ == pytest.approx(alpha_)
 
     # check that we get same best alpha with sample weights
-    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
-                  sample_weight=np.ones(n_samples))
-    assert_equal(ridge_gcv.alpha_, alpha_)
+    if filter_ == DENSE_FILTER:
+        ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
+                      sample_weight=np.ones(n_samples))
+        assert ridge_gcv.alpha_ == pytest.approx(alpha_)
 
     # simulate several responses
     Y = np.vstack((y_diabetes, y_diabetes)).T
@@ -399,8 +569,8 @@ def _test_ridge_loo(filter_):
 
     ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
     y_pred = ridge_gcv.predict(filter_(X_diabetes))
-    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
-                              Y_pred, decimal=5)
+    assert_allclose(np.vstack((y_pred, y_pred)).T,
+                    Y_pred, rtol=1e-5)
 
     return ret
 
@@ -409,7 +579,7 @@ def _test_ridge_cv_normalize(filter_):
     ridge_cv = RidgeCV(normalize=True, cv=3)
     ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes)
 
-    gs = GridSearchCV(Ridge(normalize=True), cv=3,
+    gs = GridSearchCV(Ridge(normalize=True, solver='sparse_cg'), cv=3,
                       param_grid={'alpha': ridge_cv.alphas})
     gs.fit(filter_(10. * X_diabetes), y_diabetes)
     assert_equal(gs.best_estimator_.alpha, ridge_cv.alpha_)
@@ -503,12 +673,6 @@ def test_dense_sparse(test_func):
         check_dense_sparse(test_func)
 
 
-def test_ridge_cv_sparse_svd():
-    X = sp.csr_matrix(X_diabetes)
-    ridge = RidgeCV(gcv_mode="svd")
-    assert_raises(TypeError, ridge.fit, X)
-
-
 def test_ridge_sparse_svd():
     X = sp.csc_matrix(rng.rand(100, 10))
     y = rng.rand(100)
@@ -622,6 +786,10 @@ def test_ridgecv_store_cv_values():
     r.fit(x, y)
     assert r.cv_values_.shape == (n_samples, n_targets, n_alphas)
 
+    r = RidgeCV(cv=3, store_cv_values=True)
+    assert_raises_regex(ValueError, 'cv!=None and store_cv_values',
+                        r.fit, x, y)
+
 
 @pytest.mark.filterwarnings('ignore: The default value of cv')  # 0.22
 def test_ridge_classifier_cv_store_cv_values():
@@ -764,13 +932,13 @@ def test_ridgecv_negative_alphas():
     # Negative integers
     ridge = RidgeCV(alphas=(-1, -10, -100))
     assert_raises_regex(ValueError,
-                        "alphas cannot be negative.",
+                        "alphas must be positive",
                         ridge.fit, X, y)
 
     # Negative floats
     ridge = RidgeCV(alphas=(-0.1, -1.0, -10.0))
    assert_raises_regex(ValueError,
-                        "alphas cannot be negative.",
+                        "alphas must be positive",
                        ridge.fit, X, y)
 
 
@@ -889,48 +1057,6 @@ def test_ridge_regression_check_arguments_validity(return_intercept,
     assert_allclose(out, true_coefs, rtol=0, atol=atol)
 
 
-def test_errors_and_values_helper():
-    ridgecv = _RidgeGCV()
-    rng = check_random_state(42)
-    alpha = 1.
-    n = 5
-    y = rng.randn(n)
-    v = rng.randn(n)
-    Q = rng.randn(len(v), len(v))
-    QT_y = Q.T.dot(y)
-    G_diag, c = ridgecv._errors_and_values_helper(alpha, y, v, Q, QT_y)
-
-    # test that helper function behaves as expected
-    out, c_ = ridgecv._errors(alpha, y, v, Q, QT_y)
-    np.testing.assert_array_equal(out, (c / G_diag) ** 2)
-    np.testing.assert_array_equal(c, c)
-
-    out, c_ = ridgecv._values(alpha, y, v, Q, QT_y)
-    np.testing.assert_array_equal(out, y - (c / G_diag))
-    np.testing.assert_array_equal(c_, c)
-
-
-def test_errors_and_values_svd_helper():
-    ridgecv = _RidgeGCV()
-    rng = check_random_state(42)
-    alpha = 1.
-    for n, p in zip((5, 10), (12, 6)):
-        y = rng.randn(n)
-        v = rng.randn(p)
-        U = rng.randn(n, p)
-        UT_y = U.T.dot(y)
-        G_diag, c = ridgecv._errors_and_values_svd_helper(alpha, y, v, U, UT_y)
-
-        # test that helper function behaves as expected
-        out, c_ = ridgecv._errors_svd(alpha, y, v, U, UT_y)
-        np.testing.assert_array_equal(out, (c / G_diag) ** 2)
-        np.testing.assert_array_equal(c, c)
-
-        out, c_ = ridgecv._values_svd(alpha, y, v, U, UT_y)
-        np.testing.assert_array_equal(out, y - (c / G_diag))
-        np.testing.assert_array_equal(c_, c)
-
-
 def test_ridge_classifier_no_support_multilabel():
     X, y = make_multilabel_classification(n_samples=10, random_state=0)
     assert_raises(ValueError, RidgeClassifier().fit, X, y)
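
Closing editorial note (not part of the patch): a quick way to exercise the behaviour the changelog entry describes is to fit RidgeCV with fit_intercept=True on the same uncentered data passed once dense and once sparse, and check that the chosen alpha, coefficients and intercept agree. The tolerances below mirror the rtol=1e-3 used in the new tests; small numerical differences between the dense SVD path and the sparse covariance path are expected.

    # Illustrative sketch only -- assumes the patched RidgeCV from this diff.
    import numpy as np
    from scipy import sparse
    from sklearn.linear_model import RidgeCV

    rng = np.random.RandomState(0)
    X = rng.randn(30, 6) + 10.            # offset so the intercept matters
    y = X.dot(rng.randn(6)) + 5. + rng.randn(30)

    alphas = [0.1, 1.0, 10.0]
    dense = RidgeCV(alphas=alphas, fit_intercept=True).fit(X, y)
    with_sparse = RidgeCV(alphas=alphas, fit_intercept=True).fit(
        sparse.csr_matrix(X), y)

    assert dense.alpha_ == with_sparse.alpha_
    np.testing.assert_allclose(dense.coef_, with_sparse.coef_, rtol=1e-3)
    np.testing.assert_allclose(dense.intercept_, with_sparse.intercept_,
                               rtol=1e-3)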