[MRG] LinearRegression Optimizations #17560


Closed
wants to merge 20 commits
Commits (20)
3ea2b72
Change linalg.lstsq call to include check_finite=False
rithvikrao Jun 11, 2020
bf0ed78
Refactored ridge solvers into separate file, so Cholesky can be used …
rithvikrao Jun 16, 2020
d3f7b74
Cholesky default for linreg for 2 or fewer dimensions and positive-de…
rithvikrao Jun 16, 2020
bb40fd2
Linted code
rithvikrao Jun 17, 2020
a9579c2
Ravel coefficients instead of reshaping
rithvikrao Jun 17, 2020
c9ac69e
Cholesky is param instead of heuristic. + Tests & doc
rithvikrao Jun 22, 2020
3fe0996
More Cholesky refactor, param in __init__ not fit
rithvikrao Jun 23, 2020
75cde9b
Clean up dependencies on Cholesky
rithvikrao Jun 23, 2020
a1740b3
Add docstring, sparse tests, singular/sparse TypeError
rithvikrao Jun 24, 2020
0283e52
Parametrized tests correctly, sparse_equal_dense failing
rithvikrao Jun 24, 2020
3931d91
Remove cholesky from sparse_equal_dense, lint
rithvikrao Jun 24, 2020
f8baf27
Change svd to lsqr for solver, add singular matrix test
rithvikrao Jul 1, 2020
b437ab1
Lint
rithvikrao Jul 1, 2020
5c61941
Merge remote-tracking branch 'upstream/master' into linear-regression
rithvikrao Jul 2, 2020
be3224e
Simplified alpha set code
rithvikrao Jul 22, 2020
8e6545b
Merge remote-tracking branch 'upstream/master' into linear-regression
rithvikrao Jul 30, 2020
aafd093
ValueError and complexity analysis
rithvikrao Jul 30, 2020
6fc98c9
Fixed tests and tested ValueError
rithvikrao Jul 30, 2020
76d3994
Fix unused local variable
rithvikrao Jul 30, 2020
955c0d9
Doc fix
rithvikrao Jul 30, 2020
16 changes: 16 additions & 0 deletions doc/modules/linear_model.rst
@@ -61,6 +61,12 @@ example, when data are collected without an experimental design.

* :ref:`sphx_glr_auto_examples_linear_model_plot_ols.py`

Ordinary Least Squares uses a Singular Value Decomposition (SVD) based
approach. The ``LinearRegression`` class has an additional, optional
``solver`` parameter which, if set to ``"cholesky"``, uses the Cholesky
factorization instead. See `these notes
<https://www.cs.ubc.ca/~schmidtm/Courses/540-F14/leastSquares.pdf>` for a
Member

It might be better to place references under a .. topic:: References section. One very good reference for least squares, in my opinion, is https://www.math.uchicago.edu/~may/REU2012/REUPapers/Lee.pdf.

Member

Suggested change
<https://www.cs.ubc.ca/~schmidtm/Courses/540-F14/leastSquares.pdf>` for a
<https://www.cs.ubc.ca/~schmidtm/Courses/540-F14/leastSquares.pdf>`__ for a

correct web link syntax at a minimum

discussion of the tradeoffs.
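
As a usage sketch of the parameter this PR proposes (`solver` exists only on this branch, not in released scikit-learn; the default `"lstsq"` reproduces the current behavior):

```python
import numpy as np
from sklearn.linear_model import LinearRegression  # this PR's branch

rng = np.random.RandomState(0)
X = rng.randn(100, 5)                        # n_samples=100, n_features=5
y = X @ rng.randn(5) + 0.1 * rng.randn(100)

# Default: SVD-based least squares via scipy.linalg.lstsq.
ols = LinearRegression().fit(X, y)

# Proposed option: solve the normal equations via a Cholesky factorization.
ols_chol = LinearRegression(solver="cholesky").fit(X, y)

# For well-conditioned X the two solvers should agree closely.
assert np.allclose(ols.coef_, ols_chol.coef_, atol=1e-8)
```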

Ordinary Least Squares Complexity
---------------------------------
@@ -71,6 +77,16 @@ this method has a cost of
:math:`O(n_{\text{samples}} n_{\text{features}}^2)`, assuming that
:math:`n_{\text{samples}} \geq n_{\text{features}}`.

Cholesky Complexity
---------------------------------

The Cholesky solution is computed using the Cholesky factorization of
X. If X is a matrix of shape `(n_samples, n_features)` this method has
Comment on lines +83 to +84
@lorentzenchr (Member), Jul 31, 2020

The cholesky solver solves the normal equation by a Cholesky decomposition of X'X or XX', whichever has the smaller dimension. That's why the condition number of the least squares problem is effectively squared, and the numerical solution can become more unstable compared to approaches that use a decomposition of X alone, as lstsq does.

Member

I think this would be interesting to expand the doc to include this remark, possibly as a new paragraph.
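
A small numerical sketch of this point, using only NumPy/SciPy (illustrative; the exact figures depend on the random seed): forming X'X squares the condition number, so a normal-equations solve can lose roughly twice as many digits as an SVD-based lstsq applied to X itself.

```python
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
# Ill-conditioned X with singular values spanning six orders of magnitude.
U, _ = np.linalg.qr(rng.randn(200, 10))
V, _ = np.linalg.qr(rng.randn(10, 10))
s = np.logspace(0, -6, 10)
X = U @ np.diag(s) @ V.T
w_true = rng.randn(10)
y = X @ w_true

print(np.linalg.cond(X))        # ~1e6
print(np.linalg.cond(X.T @ X))  # ~1e12: the condition number is squared

w_lstsq = linalg.lstsq(X, y)[0]                           # SVD on X itself
w_chol = linalg.solve(X.T @ X, X.T @ y, assume_a="pos")   # normal equations
print(np.linalg.norm(w_lstsq - w_true))   # typically around 1e-9 here
print(np.linalg.norm(w_chol - w_true))    # typically around 1e-4 here
```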

a cost of
:math:`O(n_{\text{samples}} n_{\text{features}}^2)` to form
:math:`X^{\intercal}X` and :math:`O(n_{\text{features}}^3)` to run the
solver.
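
For intuition about where those two terms come from, here is a plain NumPy/SciPy sketch of the normal-equations Cholesky solve (illustrative only; this is not the code added by the PR):

```python
import numpy as np
from scipy import linalg

def normal_equations_cholesky(X, y):
    """Solve min ||X w - y||_2 via the normal equations (illustrative).

    Assumes X.T @ X is positive definite, i.e. X has full column rank.
    """
    gram = X.T @ X                      # O(n_samples * n_features**2)
    Xty = X.T @ y
    c, low = linalg.cho_factor(gram)    # O(n_features**3)
    return linalg.cho_solve((c, low), Xty)

rng = np.random.RandomState(0)
X = rng.randn(1000, 20)
y = X @ rng.randn(20)
w = normal_equations_cholesky(X, y)
print(np.allclose(X @ w, y))            # True for this well-conditioned X
```

For tall, skinny X (n_samples much larger than n_features) both the SVD route and the Cholesky route are dominated by the O(n_samples * n_features^2) term, but the Cholesky route typically has a smaller constant factor, which is presumably the optimization this PR is after.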

.. _ridge_regression:

Ridge regression and classification
2 changes: 1 addition & 1 deletion sklearn/kernel_ridge.py
@@ -8,7 +8,7 @@

from .base import BaseEstimator, RegressorMixin, MultiOutputMixin
from .metrics.pairwise import pairwise_kernels
from .linear_model._ridge import _solve_cholesky_kernel
from .linear_model._ridge_solvers import _solve_cholesky_kernel
from .utils.validation import check_is_fitted, _check_sample_weight
from .utils.validation import _deprecate_positional_args

39 changes: 36 additions & 3 deletions sklearn/linear_model/_base.py
@@ -26,6 +26,7 @@

from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin,
MultiOutputMixin)
from ._ridge_solvers import _cholesky_helper
from ..utils import check_array
from ..utils.validation import FLOAT_DTYPES
from ..utils.validation import _deprecate_positional_args
@@ -419,6 +420,13 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.

solver : str, default="lstsq"
The solver to use. ``"lstsq"`` uses an SVD-based least-squares
approach by calling ``scipy.linalg.lstsq``. ``"cholesky"`` uses the
Cholesky decomposition. If X is singular, then ``"cholesky"`` will
instead use an SVD-based solver. ``"cholesky"`` does not support `X`
matrices that are both singular and sparse.
Comment on lines +426 to +428
Member

I see this is the behavior for Ridge, but this is still really strange behavior.
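
To make the documented contract concrete, a small sketch of what it implies (this only runs against the PR branch, since released scikit-learn has no `solver` parameter; the behavior shown is the one described in the docstring above):

```python
import numpy as np
from scipy import sparse
from sklearn.linear_model import LinearRegression  # this PR's branch

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
X = np.hstack([X, X[:, :1]])   # duplicate a column, so X is singular
y = rng.randn(50)

# Dense and singular: "cholesky" silently falls back to an SVD-based solve.
LinearRegression(solver="cholesky").fit(X, y)

# Sparse and singular: unsupported, so fit raises a TypeError.
try:
    LinearRegression(solver="cholesky").fit(sparse.csr_matrix(X), y)
except TypeError as exc:
    print(exc)
```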


Attributes
----------
coef_ : array of shape (n_features, ) or (n_targets, n_features)
@@ -472,11 +480,14 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
"""
@_deprecate_positional_args
def __init__(self, *, fit_intercept=True, normalize=False, copy_X=True,
n_jobs=None):
n_jobs=None, solver="lstsq"):
self.fit_intercept = fit_intercept
self.normalize = normalize
self.copy_X = copy_X
self.n_jobs = n_jobs
if solver not in ["lstsq", "cholesky"]:
raise ValueError("Solver must be either `lstsq` or `cholesky`")
Comment on lines +488 to +489
Member

By convention we validate hyperparameters in fit.

@ogrisel (Member), Feb 9, 2021

Furthermore it would be great to also expand the error message to report the observed invalid value of the solver parameter.

    raise ValueError(f'Solver must be either "lstsq" or "cholesky", got: {repr(solver)}')

self.solver = solver
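
A minimal sketch of what the reviewers are asking for (hypothetical, not the merged code): keep `__init__` as a plain attribute store and validate in `fit`, echoing the offending value.

```python
class LinearRegressionSketch:
    """Illustration of the suggested validation pattern only."""

    def __init__(self, *, fit_intercept=True, normalize=False, copy_X=True,
                 n_jobs=None, solver="lstsq"):
        # Per scikit-learn convention, __init__ only stores parameters.
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X
        self.n_jobs = n_jobs
        self.solver = solver

    def fit(self, X, y, sample_weight=None):
        # Hyperparameter validation happens here, and the error message
        # reports the observed invalid value.
        if self.solver not in ("lstsq", "cholesky"):
            raise ValueError(
                f'Solver must be either "lstsq" or "cholesky", '
                f'got: {self.solver!r}')
        # ... remainder of fit() as in the diff ...
        return self
```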

def fit(self, X, y, sample_weight=None):
"""
@@ -518,7 +529,29 @@ def fit(self, X, y, sample_weight=None):
# Sample weight can be implemented via a simple rescaling.
X, y = _rescale_data(X, y, sample_weight)

if sp.issparse(X):
if self.solver == "cholesky":
n_samples, n_features = X.shape
ravel = False
Member

This would be more explicit:

Suggested change
ravel = False
y_1d = y.ndim == 1

if y.ndim == 1:
y = y.reshape(-1, 1)
ravel = True
n_samples_, n_targets = y.shape
alpha = np.array([0], dtype=X.dtype)

if n_targets > 1:
alpha = np.repeat(alpha, n_targets)

try:
self.coef_ = _cholesky_helper(X, y, alpha, n_features,
n_samples)
except TypeError:
raise TypeError('X matrix is singular and sparse, and not '
'supported by the Cholesky solver.')

if ravel:
# When y was passed as a 1d-array, we flatten the coefficients.
self.coef_ = self.coef_.ravel()
elif sp.issparse(X):
X_offset_scale = X_offset / X_scale

def matvec(b):
@@ -544,7 +577,7 @@ def rmatvec(b):
self._residues = np.vstack([out[3] for out in outs])
else:
self.coef_, self._residues, self.rank_, self.singular_ = \
linalg.lstsq(X, y)
linalg.lstsq(X, y, check_finite=False)
self.coef_ = self.coef_.T

if y.ndim == 1:
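
`_cholesky_helper` is imported from the new `sklearn/linear_model/_ridge_solvers.py`, which this diff does not show. Based on how it is called here and on the cholesky branch it replaces in `_ridge.py` below, a rough, hypothetical reconstruction might look like the following; the name suffix, the single-alpha simplification, and the exact condition that triggers the TypeError are all assumptions:

```python
import numpy as np
from scipy import linalg, sparse
from sklearn.utils.extmath import safe_sparse_dot

def _cholesky_helper_sketch(X, y, alpha, n_features, n_samples):
    """Rough reconstruction of the helper's behavior; not the PR's code.

    Assumes a single shared penalty alpha[0] (LinearRegression passes
    zeros) and 2D y, and mirrors the branch removed from _ridge.py below.
    """
    if n_features > n_samples:
        # Dual (kernel) form: w = X^T (X X^T + alpha I)^{-1} y
        K = safe_sparse_dot(X, X.T, dense_output=True)
        K.flat[::n_samples + 1] += alpha[0]
        try:
            # assume_a="pos" is the modern spelling of sym_pos=True
            dual_coef = linalg.solve(K, y, assume_a="pos")
        except np.linalg.LinAlgError:
            if sparse.issparse(X):
                # Singular and sparse is unsupported; LinearRegression.fit
                # turns this into its own TypeError message.
                raise TypeError("singular and sparse X")
            return linalg.lstsq(X, y)[0].T     # SVD-based fallback on X
        return safe_sparse_dot(X.T, dual_coef, dense_output=True).T

    # Primal form: w = (X^T X + alpha I)^{-1} X^T y
    A = safe_sparse_dot(X.T, X, dense_output=True)
    Xy = safe_sparse_dot(X.T, y, dense_output=True)
    A.flat[::n_features + 1] += alpha[0]
    try:
        return linalg.solve(A, Xy, assume_a="pos").T
    except np.linalg.LinAlgError:
        if sparse.issparse(X):
            raise TypeError("singular and sparse X")
        return linalg.lstsq(X, y)[0].T         # SVD-based fallback on X
```

The real helper presumably also handles per-target penalties, as the removed `_solve_cholesky` further down does.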
214 changes: 4 additions & 210 deletions sklearn/linear_model/_ridge.py
@@ -15,9 +15,11 @@
import numpy as np
from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg

from ._base import LinearClassifierMixin, LinearModel, _rescale_data
from ._ridge_solvers import (
_solve_sparse_cg, _solve_lsqr, _solve_svd, _cholesky_helper
)
from ._sag import sag_solver
from ..base import RegressorMixin, MultiOutputMixin, is_classifier
from ..utils.extmath import safe_sparse_dot
@@ -31,203 +33,9 @@
from ..preprocessing import LabelBinarizer
from ..model_selection import GridSearchCV
from ..metrics import check_scoring
from ..exceptions import ConvergenceWarning
from ..utils.sparsefuncs import mean_variance_axis


def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0,
X_offset=None, X_scale=None):

def _get_rescaled_operator(X):

X_offset_scale = X_offset / X_scale

def matvec(b):
return X.dot(b) - b.dot(X_offset_scale)

def rmatvec(b):
return X.T.dot(b) - X_offset_scale * np.sum(b)

X1 = sparse.linalg.LinearOperator(shape=X.shape,
matvec=matvec,
rmatvec=rmatvec)
return X1

n_samples, n_features = X.shape

if X_offset is None or X_scale is None:
X1 = sp_linalg.aslinearoperator(X)
else:
X1 = _get_rescaled_operator(X)

coefs = np.empty((y.shape[1], n_features), dtype=X.dtype)

if n_features > n_samples:
def create_mv(curr_alpha):
def _mv(x):
return X1.matvec(X1.rmatvec(x)) + curr_alpha * x
return _mv
else:
def create_mv(curr_alpha):
def _mv(x):
return X1.rmatvec(X1.matvec(x)) + curr_alpha * x
return _mv

for i in range(y.shape[1]):
y_column = y[:, i]

mv = create_mv(alpha[i])
if n_features > n_samples:
# kernel ridge
# w = X.T * inv(X X^t + alpha*Id) y
C = sp_linalg.LinearOperator(
(n_samples, n_samples), matvec=mv, dtype=X.dtype)
# FIXME atol
try:
coef, info = sp_linalg.cg(C, y_column, tol=tol, atol='legacy')
except TypeError:
# old scipy
coef, info = sp_linalg.cg(C, y_column, tol=tol)
coefs[i] = X1.rmatvec(coef)
else:
# linear ridge
# w = inv(X^t X + alpha*Id) * X.T y
y_column = X1.rmatvec(y_column)
C = sp_linalg.LinearOperator(
(n_features, n_features), matvec=mv, dtype=X.dtype)
# FIXME atol
try:
coefs[i], info = sp_linalg.cg(C, y_column, maxiter=max_iter,
tol=tol, atol='legacy')
except TypeError:
# old scipy
coefs[i], info = sp_linalg.cg(C, y_column, maxiter=max_iter,
tol=tol)

if info < 0:
raise ValueError("Failed with error code %d" % info)

if max_iter is None and info > 0 and verbose:
warnings.warn("sparse_cg did not converge after %d iterations." %
info, ConvergenceWarning)

return coefs


def _solve_lsqr(X, y, alpha, max_iter=None, tol=1e-3):
n_samples, n_features = X.shape
coefs = np.empty((y.shape[1], n_features), dtype=X.dtype)
n_iter = np.empty(y.shape[1], dtype=np.int32)

# According to the lsqr documentation, alpha = damp^2.
sqrt_alpha = np.sqrt(alpha)

for i in range(y.shape[1]):
y_column = y[:, i]
info = sp_linalg.lsqr(X, y_column, damp=sqrt_alpha[i],
atol=tol, btol=tol, iter_lim=max_iter)
coefs[i] = info[0]
n_iter[i] = info[2]

return coefs, n_iter


def _solve_cholesky(X, y, alpha):
# w = inv(X^t X + alpha*Id) * X.T y
n_features = X.shape[1]
n_targets = y.shape[1]

A = safe_sparse_dot(X.T, X, dense_output=True)
Xy = safe_sparse_dot(X.T, y, dense_output=True)

one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]])

if one_alpha:
A.flat[::n_features + 1] += alpha[0]
return linalg.solve(A, Xy, sym_pos=True,
overwrite_a=True).T
else:
coefs = np.empty([n_targets, n_features], dtype=X.dtype)
for coef, target, current_alpha in zip(coefs, Xy.T, alpha):
A.flat[::n_features + 1] += current_alpha
coef[:] = linalg.solve(A, target, sym_pos=True,
overwrite_a=False).ravel()
A.flat[::n_features + 1] -= current_alpha
return coefs


def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False):
# dual_coef = inv(X X^t + alpha*Id) y
n_samples = K.shape[0]
n_targets = y.shape[1]

if copy:
K = K.copy()

alpha = np.atleast_1d(alpha)
one_alpha = (alpha == alpha[0]).all()
has_sw = isinstance(sample_weight, np.ndarray) \
or sample_weight not in [1.0, None]

if has_sw:
# Unlike other solvers, we need to support sample_weight directly
# because K might be a pre-computed kernel.
sw = np.sqrt(np.atleast_1d(sample_weight))
y = y * sw[:, np.newaxis]
K *= np.outer(sw, sw)

if one_alpha:
# Only one penalty, we can solve multi-target problems in one time.
K.flat[::n_samples + 1] += alpha[0]

try:
# Note: we must use overwrite_a=False in order to be able to
# use the fall-back solution below in case a LinAlgError
# is raised
dual_coef = linalg.solve(K, y, sym_pos=True,
overwrite_a=False)
except np.linalg.LinAlgError:
warnings.warn("Singular matrix in solving dual problem. Using "
"least-squares solution instead.")
dual_coef = linalg.lstsq(K, y)[0]

# K is expensive to compute and store in memory so change it back in
# case it was user-given.
K.flat[::n_samples + 1] -= alpha[0]

if has_sw:
dual_coef *= sw[:, np.newaxis]

return dual_coef
else:
# One penalty per target. We need to solve each target separately.
dual_coefs = np.empty([n_targets, n_samples], K.dtype)

for dual_coef, target, current_alpha in zip(dual_coefs, y.T, alpha):
K.flat[::n_samples + 1] += current_alpha

dual_coef[:] = linalg.solve(K, target, sym_pos=True,
overwrite_a=False).ravel()

K.flat[::n_samples + 1] -= current_alpha

if has_sw:
dual_coefs *= sw[np.newaxis, :]

return dual_coefs.T


def _solve_svd(X, y, alpha):
U, s, Vt = linalg.svd(X, full_matrices=False)
idx = s > 1e-15 # same default value as scipy.linalg.pinv
s_nnz = s[idx][:, np.newaxis]
UTy = np.dot(U.T, y)
d = np.zeros((s.size, alpha.size), dtype=X.dtype)
d[idx] = s_nnz / (s_nnz ** 2 + alpha)
d_UT_y = d * UTy
return np.dot(Vt.T, d_UT_y).T


def _get_valid_accept_sparse(is_X_sparse, solver):
if is_X_sparse and solver in ['auto', 'sag', 'saga']:
return 'csr'
@@ -457,21 +265,7 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
coef, n_iter = _solve_lsqr(X, y, alpha, max_iter, tol)

elif solver == 'cholesky':
if n_features > n_samples:
K = safe_sparse_dot(X, X.T, dense_output=True)
try:
dual_coef = _solve_cholesky_kernel(K, y, alpha)

coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T
except linalg.LinAlgError:
# use SVD solver if matrix is singular
solver = 'svd'
else:
try:
coef = _solve_cholesky(X, y, alpha)
except linalg.LinAlgError:
# use SVD solver if matrix is singular
solver = 'svd'
coef = _cholesky_helper(X, y, alpha, n_features, n_samples)

elif solver in ['sag', 'saga']:
# precompute max_squared_sum for all targets