diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index a370791d248e2..c01b74775684f 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -136,17 +136,24 @@
 Setting the regularization parameter: generalized Cross-Validation
 ------------------------------------------------------------------
 
 :class:`RidgeCV` implements ridge regression with built-in
-cross-validation of the alpha parameter.  The object works in the same way
+cross-validation of the alpha parameter. The object works in the same way
 as GridSearchCV except that it defaults to Generalized Cross-Validation
 (GCV), an efficient form of leave-one-out cross-validation::
 
+    >>> import numpy as np
     >>> from sklearn import linear_model
-    >>> reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3)
-    >>> reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])       # doctest: +SKIP
-    RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3, fit_intercept=True, scoring=None,
-        normalize=False)
-    >>> reg.alpha_                                      # doctest: +SKIP
-    0.1
+    >>> reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
+    >>> reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])  # doctest: +NORMALIZE_WHITESPACE
+    RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
+           1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]),
+        cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
+        scoring=None, store_cv_values=False)
+    >>> reg.alpha_
+    0.01
+
+Specifying the value of the `cv` attribute will trigger the use of
+cross-validation with `GridSearchCV`, for example `cv=10` for 10-fold
+cross-validation, rather than Generalized Cross-Validation.
 
 .. topic:: References
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 473bbf91878d5..38ca0a96040ea 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -38,6 +38,8 @@ random sampling procedures.
   seed, including :class:`linear_model.LogisticRegression`,
   :class:`linear_model.LogisticRegressionCV`, :class:`linear_model.Ridge`,
   and :class:`linear_model.RidgeCV` with 'sag' solver. |Fix|
+- :class:`linear_model.ridge.RidgeCV` when using generalized cross-validation
+  with sparse inputs. |Fix|
 
 Details are listed in the changelog below.
 
@@ -384,7 +386,7 @@ Support for Python 3.4 and below has been officially dropped.
 :mod:`sklearn.linear_model`
 ...........................
 
-- |Enhancement| :mod:`linear_model.ridge` now preserves ``float32`` and
+- |Enhancement| :class:`linear_model.Ridge` now preserves ``float32`` and
   ``float64`` dtypes. :issues:`8769` and :issues:`11000` by
   :user:`Guillaume Lemaitre `, and :user:`Joan Massich `
 
@@ -482,6 +484,10 @@ Support for Python 3.4 and below has been officially dropped.
   in version 0.21 and will be removed in version 0.23. :pr:`12821` by
   :user:`Nicolas Hug `.
 
+- |Fix| :class:`linear_model.ridge.RidgeCV` with generalized cross-validation
+  now correctly fits an intercept when ``fit_intercept=True`` and the design
+  matrix is sparse. :issue:`13350` by :user:`Jérôme Dockès `
+
 :mod:`sklearn.manifold`
 .......................
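
Editorial aside (not part of the patch): the doc text above describes GCV as "an efficient form of leave-one-out cross-validation". The identity it relies on can be checked with stock scikit-learn and NumPy: for a fixed alpha, the leave-one-out residual of sample i equals (y_i - y_hat_i) / (1 - h_ii), where h_ii is the i-th diagonal entry of the ridge hat matrix, so nothing has to be refit n times. The sketch below uses fit_intercept=False to keep the algebra minimal; handling the intercept (and sample weights) without densifying X is exactly what the ridge.py changes that follow are about.

    # Illustrative sketch only -- not part of this patch.
    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X, y, alpha = rng.randn(20, 5), rng.randn(20), 1.0

    # Hat matrix of ridge regression without intercept.
    H = X @ np.linalg.inv(X.T @ X + alpha * np.eye(X.shape[1])) @ X.T
    y_hat = Ridge(alpha=alpha, fit_intercept=False).fit(X, y).predict(X)
    loo_fast = (y - y_hat) / (1 - np.diag(H))  # closed-form LOO residuals

    # Brute-force leave-one-out for comparison.
    loo_slow = np.empty_like(y)
    for i in range(len(y)):
        mask = np.arange(len(y)) != i
        model = Ridge(alpha=alpha, fit_intercept=False).fit(X[mask], y[mask])
        loo_slow[i] = y[i] - model.predict(X[i:i + 1])[0]

    assert np.allclose(loo_fast, loo_slow)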
diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py
index b36535ffaeff1..0e54126e52c33 100644
--- a/sklearn/linear_model/ridge.py
+++ b/sklearn/linear_model/ridge.py
@@ -31,6 +31,7 @@
 from ..model_selection import GridSearchCV
 from ..metrics.scorer import check_scoring
 from ..exceptions import ConvergenceWarning
+from ..utils.sparsefuncs import mean_variance_axis
 
 
 def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0,
@@ -927,6 +928,106 @@ def classes_(self):
         return self._label_binarizer.classes_
 
 
+def _check_gcv_mode(X, gcv_mode):
+    possible_gcv_modes = [None, 'auto', 'svd', 'eigen']
+    if gcv_mode not in possible_gcv_modes:
+        raise ValueError(
+            "Unknown value for 'gcv_mode'. "
+            "Got {} instead of one of {}".format(
+                gcv_mode, possible_gcv_modes))
+    if gcv_mode in ['eigen', 'svd']:
+        return gcv_mode
+    # if X has more rows than columns, use decomposition of X^T.X,
+    # otherwise X.X^T
+    if X.shape[0] > X.shape[1]:
+        return 'svd'
+    return 'eigen'
+
+
+def _find_smallest_angle(query, vectors):
+    """Find the column of vectors that is most aligned with the query.
+
+    Both query and the columns of vectors must have their l2 norm equal to 1.
+
+    Parameters
+    ----------
+    query : ndarray, shape (n_samples,)
+        Normalized query vector.
+
+    vectors : ndarray, shape (n_samples, n_features)
+        Vectors to which we compare query, as columns. Must be normalized.
+    """
+    abs_cosine = np.abs(query.dot(vectors))
+    index = np.argmax(abs_cosine)
+    return index
+
+
+class _X_operator(sparse.linalg.LinearOperator):
+    """Behaves as centered and scaled X with an added intercept column.
+
+    This operator behaves as
+    np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]])
+    """
+
+    def __init__(self, X, X_mean, sqrt_sw):
+        n_samples, n_features = X.shape
+        super().__init__(X.dtype, (n_samples, n_features + 1))
+        self.X = X
+        self.X_mean = X_mean
+        self.sqrt_sw = sqrt_sw
+
+    def _matvec(self, v):
+        v = v.ravel()
+        return safe_sparse_dot(
+            self.X, v[:-1], dense_output=True
+        ) - self.sqrt_sw * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw
+
+    def _matmat(self, v):
+        return (
+            safe_sparse_dot(self.X, v[:-1], dense_output=True) -
+            self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) + v[-1] *
+            self.sqrt_sw[:, None])
+
+    def _transpose(self):
+        return _Xt_operator(self.X, self.X_mean, self.sqrt_sw)
+
+
+class _Xt_operator(sparse.linalg.LinearOperator):
+    """Behaves as transposed centered and scaled X with an intercept column.
+
+    This operator behaves as
+    np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T
+    """
+
+    def __init__(self, X, X_mean, sqrt_sw):
+        n_samples, n_features = X.shape
+        super().__init__(X.dtype, (n_features + 1, n_samples))
+        self.X = X
+        self.X_mean = X_mean
+        self.sqrt_sw = sqrt_sw
+
+    def _matvec(self, v):
+        v = v.ravel()
+        n_features = self.shape[0]
+        res = np.empty(n_features, dtype=self.X.dtype)
+        res[:-1] = (
+            safe_sparse_dot(self.X.T, v, dense_output=True) -
+            (self.X_mean * self.sqrt_sw.dot(v))
+        )
+        res[-1] = np.dot(v, self.sqrt_sw)
+        return res
+
+    def _matmat(self, v):
+        n_features = self.shape[0]
+        res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype)
+        res[:-1] = (
+            safe_sparse_dot(self.X.T, v, dense_output=True) -
+            self.X_mean[:, None] * self.sqrt_sw.dot(v)
+        )
+        res[-1] = np.dot(self.sqrt_sw, v)
+        return res
+
+
 class _RidgeGCV(LinearModel):
     """Ridge regression with built-in Generalized Cross-Validation
 
@@ -978,18 +1079,6 @@ def __init__(self, alphas=(0.1, 1.0, 10.0),
         self.gcv_mode = gcv_mode
         self.store_cv_values = store_cv_values
 
-    def _pre_compute(self, X, y, centered_kernel=True):
-        # even if X is very sparse, K is usually very dense
-        K = safe_sparse_dot(X, X.T, dense_output=True)
-        # the following emulates an additional constant regressor
-        # corresponding to fit_intercept=True
-        # but this is done only when the features have been centered
-        if centered_kernel:
-            K += np.ones_like(K)
-        v, Q = linalg.eigh(K)
-        QT_y = np.dot(Q.T, y)
-        return v, Q, QT_y
-
     def _decomp_diag(self, v_prime, Q):
         # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T))
         return (v_prime * Q ** 2).sum(axis=-1)
@@ -1001,18 +1090,161 @@ def _diag_dot(self, D, B):
         D = D[(slice(None), ) + (np.newaxis, ) * (len(B.shape) - 1)]
         return D * B
 
-    def _errors_and_values_helper(self, alpha, y, v, Q, QT_y):
-        """Helper function to avoid code duplication between self._errors and
-        self._values.
+    def _compute_gram(self, X, sqrt_sw):
+        """Computes the Gram matrix X.X^T with possible centering.
 
-        Notes
-        -----
-        We don't construct matrix G, instead compute action on y & diagonal.
+        When ``fit_intercept`` is True and ``X`` is sparse, compute
+        (X - X.mean(axis=0)).dot((X - X.mean(axis=0)).T); otherwise compute
+        X.dot(X.T), since a dense X has already been centered in
+        preprocessing (or no intercept is being fit).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            The input uncentered data.
+
+        sqrt_sw : ndarray, shape (n_samples,)
+            Square roots of the sample weights.
+
+        Returns
+        -------
+        gram : ndarray, shape (n_samples, n_samples)
+            The Gram matrix.
+        X_mean : ndarray, shape (n_features,)
+            The weighted mean of ``X`` for each feature.
+        """
+        center = self.fit_intercept and sparse.issparse(X)
+        if not center:
+            # in this case centering has been done in preprocessing
+            # or we are not fitting an intercept.
+            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
+            return safe_sparse_dot(X, X.T, dense_output=True), X_mean
+        # otherwise X is always sparse
+        n_samples = X.shape[0]
+        sample_weight_matrix = sparse.dia_matrix(
+            (sqrt_sw, 0), shape=(n_samples, n_samples))
+        X_weighted = sample_weight_matrix.dot(X)
+        X_mean, _ = mean_variance_axis(X_weighted, axis=0)
+        X_mean *= n_samples / sqrt_sw.dot(sqrt_sw)
+        X_mX = sqrt_sw[:, None] * safe_sparse_dot(
+            X_mean, X.T, dense_output=True)
+        X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean)
+        return (safe_sparse_dot(X, X.T, dense_output=True) + X_mX_m
+                - X_mX - X_mX.T, X_mean)
+
+    def _compute_covariance(self, X, sqrt_sw):
+        """Computes the covariance matrix X^T.X with possible centering.
+
+        When ``fit_intercept`` is True, compute
+        (X - X.mean(axis=0)).T.dot(X - X.mean(axis=0));
+        otherwise compute X.T.dot(X).
+
+        Parameters
+        ----------
+        X : sparse matrix, shape (n_samples, n_features)
+            The input uncentered data.
+
+        sqrt_sw : ndarray, shape (n_samples,)
+            Square roots of the sample weights.
+
+        Returns
+        -------
+        covariance : ndarray, shape (n_features, n_features)
+            The covariance matrix.
+        X_mean : ndarray, shape (n_features,)
+            The weighted mean of ``X`` for each feature.
+        """
+        if not self.fit_intercept:
+            # in this case no centering is needed.
+            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
+            return safe_sparse_dot(X.T, X, dense_output=True), X_mean
+        # this function only gets called for sparse X
+        n_samples = X.shape[0]
+        sample_weight_matrix = sparse.dia_matrix(
+            (sqrt_sw, 0), shape=(n_samples, n_samples))
+        X_weighted = sample_weight_matrix.dot(X)
+        X_mean, _ = mean_variance_axis(X_weighted, axis=0)
+        X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw)
+        weight_sum = sqrt_sw.dot(sqrt_sw)
+        return (safe_sparse_dot(X.T, X, dense_output=True) -
+                weight_sum * np.outer(X_mean, X_mean),
+                X_mean)
+
+    def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw):
+        """Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T)
+        without explicitly centering X or computing X.dot(A)
+        when X is sparse.
+
+        Parameters
+        ----------
+        X : sparse matrix, shape = (n_samples, n_features)
+
+        A : np.ndarray, shape = (n_features, n_features)
+
+        X_mean : np.ndarray, shape = (n_features,)
+
+        sqrt_sw : np.ndarray, shape = (n_samples,)
+            Square roots of the sample weights.
+
+        Returns
+        -------
+        diag : np.ndarray, shape = (n_samples,)
+            The computed diagonal.
+        """
+        intercept_col = sqrt_sw
+        scale = sqrt_sw
+        batch_size = X.shape[1]
+        diag = np.empty(X.shape[0], dtype=X.dtype)
+        for start in range(0, X.shape[0], batch_size):
+            batch = slice(start, min(X.shape[0], start + batch_size), 1)
+            X_batch = np.empty(
+                (X[batch].shape[0], X.shape[1] + self.fit_intercept),
+                dtype=X.dtype
+            )
+            if self.fit_intercept:
+                X_batch[:, :-1] = X[batch].A - X_mean * scale[batch][:, None]
+                X_batch[:, -1] = intercept_col[batch]
+            else:
+                X_batch = X[batch].A
+            diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1)
+        return diag
+
+    def _eigen_decompose_gram(self, X, y, sqrt_sw):
+        """Eigendecomposition of X.X^T, used when n_samples <= n_features."""
+        # if X is dense it has already been centered in preprocessing
+        K, X_mean = self._compute_gram(X, sqrt_sw)
+        if self.fit_intercept:
+            # to emulate centering X with sample weights,
+            # i.e. removing the weighted average, we add a column
+            # containing the square roots of the sample weights.
+            # by centering, it is orthogonal to the other columns
+            K += np.outer(sqrt_sw, sqrt_sw)
+        v, Q = linalg.eigh(K)
+        QT_y = np.dot(Q.T, y)
+        return X_mean, v, Q, QT_y
+
+    def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, v, Q, QT_y):
+        """Compute dual coefficients and diagonal of (Identity - Hat_matrix)
+
+        Used when we have a decomposition of X.X^T (n_features >= n_samples).
         """
         w = 1. / (v + alpha)
-        constant_column = np.var(Q, 0) < 1.e-12
-        # detect constant columns
-        w[constant_column] = 0  # cancel the regularization for the intercept
+        if self.fit_intercept:
+            # the vector containing the square roots of the sample weights (1
+            # when no sample weights) is the eigenvector of XX^T which
+            # corresponds to the intercept; we cancel the regularization on
+            # this dimension. the corresponding eigenvalue is
+            # sum(sample_weight).
+            normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw)
+            intercept_dim = _find_smallest_angle(normalized_sw, Q)
+            w[intercept_dim] = 0  # cancel regularization for the intercept
 
         c = np.dot(Q, self._diag_dot(w, QT_y))
         G_diag = self._decomp_diag(w, Q)
@@ -1021,35 +1253,117 @@ def _errors_and_values_helper(self, alpha, y, v, Q, QT_y):
             G_diag = G_diag[:, np.newaxis]
         return G_diag, c
 
-    def _errors(self, alpha, y, v, Q, QT_y):
-        G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y)
-        return (c / G_diag) ** 2, c
+    def _eigen_decompose_covariance(self, X, y, sqrt_sw):
+        """Eigendecomposition of X^T.X, used when n_samples > n_features."""
+        n_samples, n_features = X.shape
+        cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype)
+        cov[:-1, :-1], X_mean = self._compute_covariance(X, sqrt_sw)
+        if not self.fit_intercept:
+            cov = cov[:-1, :-1]
+        # to emulate centering X with sample weights,
+        # i.e. removing the weighted average, we add a column
+        # containing the square roots of the sample weights.
+        # by centering, it is orthogonal to the other columns
+        # when all samples have the same weight we add a column of 1
+        else:
+            cov[-1] = 0
+            cov[:, -1] = 0
+            cov[-1, -1] = sqrt_sw.dot(sqrt_sw)
+        nullspace_dim = max(0, X.shape[1] - X.shape[0])
+        s, V = linalg.eigh(cov)
+        # remove eigenvalues and vectors in the null space of X^T.X
+        s = s[nullspace_dim:]
+        V = V[:, nullspace_dim:]
+        return X_mean, s, V, X
+
+    def _solve_eigen_covariance_no_intercept(
+            self, alpha, y, sqrt_sw, X_mean, s, V, X):
+        """Compute dual coefficients and diagonal of (Identity - Hat_matrix)
+
+        Used when we have a decomposition of X^T.X
+        (n_features < n_samples and X is sparse), and not fitting an intercept.
+ """ + w = 1 / (s + alpha) + A = (V * w).dot(V.T) + AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True)) + y_hat = safe_sparse_dot(X, AXy, dense_output=True) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha - def _values(self, alpha, y, v, Q, QT_y): - G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y) - return y - (c / G_diag), c + def _solve_eigen_covariance_intercept( + self, alpha, y, sqrt_sw, X_mean, s, V, X): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) - def _pre_compute_svd(self, X, y, centered_kernel=True): - if sparse.issparse(X): - raise TypeError("SVD not supported for sparse matrices") - if centered_kernel: - X = np.hstack((X, np.ones((X.shape[0], 1)))) - # to emulate fit_intercept=True situation, add a column on ones - # Note that by centering, the other columns are orthogonal to that one + Used when we have a decomposition of X^T.X + (n_features < n_samples and X is sparse), + and we are fitting an intercept. + """ + # the vector [0, 0, ..., 0, 1] + # is the eigenvector of X^TX which + # corresponds to the intercept; we cancel the regularization on + # this dimension. the corresponding eigenvalue is + # sum(sample_weight), e.g. n when uniform sample weights. + intercept_sv = np.zeros(V.shape[0]) + intercept_sv[-1] = 1 + intercept_dim = _find_smallest_angle(intercept_sv, V) + w = 1 / (s + alpha) + w[intercept_dim] = 1 / s[intercept_dim] + A = (V * w).dot(V.T) + # add a column to X containing the square roots of sample weights + X_op = _X_operator(X, X_mean, sqrt_sw) + AXy = A.dot(X_op.T.dot(y)) + y_hat = X_op.dot(AXy) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + # return (1 - hat_diag), (y - y_hat) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha + + def _solve_eigen_covariance( + self, alpha, y, sqrt_sw, X_mean, s, V, X): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have a decomposition of X^T.X + (n_features < n_samples and X is sparse). + """ + if self.fit_intercept: + return self._solve_eigen_covariance_intercept( + alpha, y, sqrt_sw, X_mean, s, V, X) + return self._solve_eigen_covariance_no_intercept( + alpha, y, sqrt_sw, X_mean, s, V, X) + + def _svd_decompose_design_matrix(self, X, y, sqrt_sw): + # X already centered + X_mean = np.zeros(X.shape[1], dtype=X.dtype) + if self.fit_intercept: + # to emulate fit_intercept=True situation, add a column + # containing the square roots of the sample weights + # by centering, the other columns are orthogonal to that one + intercept_column = sqrt_sw[:, None] + X = np.hstack((X, intercept_column)) U, s, _ = linalg.svd(X, full_matrices=0) v = s ** 2 UT_y = np.dot(U.T, y) - return v, U, UT_y + return X_mean, v, U, UT_y - def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y): - """Helper function to avoid code duplication between self._errors_svd - and self._values_svd. + def _solve_svd_design_matrix( + self, alpha, y, sqrt_sw, X_mean, v, U, UT_y): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have an SVD decomposition of X + (n_features >= n_samples and X is dense). 
""" - constant_column = np.var(U, 0) < 1.e-12 - # detect columns colinear to ones w = ((v + alpha) ** -1) - (alpha ** -1) - w[constant_column] = - (alpha ** -1) - # cancel the regularization for the intercept + if self.fit_intercept: + # detect intercept column + normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) + intercept_dim = _find_smallest_angle(normalized_sw, U) + # cancel the regularization for the intercept + w[intercept_dim] = - (alpha ** -1) c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha ** -1) * y G_diag = self._decomp_diag(w, U) + (alpha ** -1) if len(y.shape) != 1: @@ -1057,24 +1371,16 @@ def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y): G_diag = G_diag[:, np.newaxis] return G_diag, c - def _errors_svd(self, alpha, y, v, U, UT_y): - G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y) - return (c / G_diag) ** 2, c - - def _values_svd(self, alpha, y, v, U, UT_y): - G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y) - return y - (c / G_diag), c - def fit(self, X, y, sample_weight=None): """Fit Ridge regression model Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] - Training data + Training data. Will be cast to float64 if necessary y : array-like, shape = [n_samples] or [n_samples, n_targets] - Target values. Will be cast to X's dtype if necessary + Target values. Will be cast to float64 if necessary sample_weight : float or array-like of shape [n_samples] Sample weight @@ -1083,10 +1389,15 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, - accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float64, np.float32], + X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], + dtype=[np.float64], multi_output=True, y_numeric=True) + + if np.any(self.alphas <= 0): + raise ValueError( + "alphas must be positive. 
Got {} containing some " + "negative or null value instead.".format(self.alphas)) + if sample_weight is not None and not isinstance(sample_weight, float): sample_weight = check_array(sample_weight, ensure_2d=False, dtype=X.dtype) @@ -1096,56 +1407,42 @@ def fit(self, X, y, sample_weight=None): X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) - gcv_mode = self.gcv_mode - with_sw = len(np.shape(sample_weight)) - - if gcv_mode is None or gcv_mode == 'auto': - if sparse.issparse(X) or n_features > n_samples or with_sw: - gcv_mode = 'eigen' - else: - gcv_mode = 'svd' - elif gcv_mode == "svd" and with_sw: - # FIXME non-uniform sample weights not yet supported - warnings.warn("non-uniform sample weights unsupported for svd, " - "forcing usage of eigen") - gcv_mode = 'eigen' + gcv_mode = _check_gcv_mode(X, self.gcv_mode) if gcv_mode == 'eigen': - _pre_compute = self._pre_compute - _errors = self._errors - _values = self._values + decompose = self._eigen_decompose_gram + solve = self._solve_eigen_gram elif gcv_mode == 'svd': - # assert n_samples >= n_features - _pre_compute = self._pre_compute_svd - _errors = self._errors_svd - _values = self._values_svd - else: - raise ValueError('bad gcv_mode "%s"' % gcv_mode) + if sparse.issparse(X): + decompose = self._eigen_decompose_covariance + solve = self._solve_eigen_covariance + else: + decompose = self._svd_decompose_design_matrix + solve = self._solve_svd_design_matrix if sample_weight is not None: X, y = _rescale_data(X, y, sample_weight) - - centered_kernel = not sparse.issparse(X) and self.fit_intercept - - v, Q, QT_y = _pre_compute(X, y, centered_kernel) - n_y = 1 if len(y.shape) == 1 else y.shape[1] - cv_values = np.zeros((n_samples * n_y, len(self.alphas))) - C = [] + sqrt_sw = np.sqrt(sample_weight) + else: + sqrt_sw = np.ones(X.shape[0], dtype=X.dtype) scorer = check_scoring(self, scoring=self.scoring, allow_none=True) error = scorer is None - if np.any(self.alphas < 0): - raise ValueError("alphas cannot be negative. " - "Got {} containing some " - "negative value instead.".format(self.alphas)) - + n_y = 1 if len(y.shape) == 1 else y.shape[1] + cv_values = np.zeros((n_samples * n_y, len(self.alphas)), + dtype=X.dtype) + C = [] + X_mean, *decomposition = decompose(X, y, sqrt_sw) for i, alpha in enumerate(self.alphas): + G_diag, c = solve( + float(alpha), y, sqrt_sw, X_mean, *decomposition) if error: - out, c = _errors(float(alpha), y, v, Q, QT_y) + squared_errors = (c / G_diag) ** 2 + cv_values[:, i] = squared_errors.ravel() else: - out, c = _values(float(alpha), y, v, Q, QT_y) - cv_values[:, i] = out.ravel() + predictions = y - (c / G_diag) + cv_values[:, i] = predictions.ravel() C.append(c) if error: @@ -1167,6 +1464,7 @@ def identity_estimator(): self.dual_coef_ = C[best] self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) + X_offset += X_mean * X_scale self._set_intercept(X_offset, y_offset, X_scale) if self.store_cv_values: @@ -1198,7 +1496,8 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- X : array-like, shape = [n_samples, n_features] - Training data + Training data. If using GCV, will be cast to float64 + if necessary. y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values. 
@@ -1209,8 +1508,17 @@ def fit(self, X, y, sample_weight=None):
         Returns
         -------
         self : object
+
+        Notes
+        -----
+        When sample_weight is provided, the selected hyperparameter may depend
+        on whether we use generalized cross-validation (cv=None or cv='auto')
+        or another form of cross-validation, because only generalized
+        cross-validation takes the sample weights into account when computing
+        the validation score.
         """
-        if self.cv is None:
+        cv = self.cv
+        if cv is None:
             estimator = _RidgeGCV(self.alphas,
                                   fit_intercept=self.fit_intercept,
                                   normalize=self.normalize,
@@ -1226,9 +1534,11 @@ def fit(self, X, y, sample_weight=None):
                 raise ValueError("cv!=None and store_cv_values=True "
                                  " are incompatible")
             parameters = {'alpha': self.alphas}
+            solver = 'sparse_cg' if sparse.issparse(X) else 'auto'
             gs = GridSearchCV(Ridge(fit_intercept=self.fit_intercept,
-                                    normalize=self.normalize),
-                              parameters, cv=self.cv, scoring=self.scoring)
+                                    normalize=self.normalize,
+                                    solver=solver),
+                              parameters, cv=cv, scoring=self.scoring)
             gs.fit(X, y, sample_weight=sample_weight)
             estimator = gs.best_estimator_
             self.alpha_ = gs.best_estimator_.alpha
@@ -1258,6 +1568,7 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin):
         the estimates. Larger values specify stronger regularization.
         Alpha corresponds to ``C^-1`` in other linear models such as
         LogisticRegression or LinearSVC.
+        If using generalized cross-validation, alphas must be positive.
 
     fit_intercept : boolean
         Whether to calculate the intercept for this model. If set
@@ -1276,12 +1587,15 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin):
         A string (see model evaluation documentation) or
         a scorer callable object / function
         with signature ``scorer(estimator, X, y)``.
+        If None, the negative mean squared error is used when cv is 'auto'
+        or None (i.e. generalized cross-validation), and r2 score otherwise.
 
     cv : int, cross-validation generator or an iterable, optional
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
 
        - None, to use the efficient Leave-One-Out cross-validation
+          (also known as Generalized Cross-Validation).
        - integer, to specify the number of folds.
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.
@@ -1297,15 +1611,13 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin):
        Flag indicating which strategy to use when performing
        Generalized Cross-Validation. Options are::
 
-        'auto' : use svd if n_samples > n_features or when X is a sparse
-            matrix, otherwise use eigen
-        'svd' : force computation via singular value decomposition of X
-            (does not work for sparse matrices)
-        'eigen' : force computation via eigendecomposition of X^T X
+        'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen'
+        'svd' : force use of singular value decomposition of X when X is
+            dense, eigenvalue decomposition of X^T.X when X is sparse.
+        'eigen' : force computation via eigendecomposition of X.X^T
 
        The 'auto' mode is the default and is intended to pick the cheaper
-       option of the two depending upon the shape and format of the training
-       data.
+       option of the two depending on the shape of the training data.
 
    store_cv_values : boolean, default=False
        Flag indicating if the cross-validation values corresponding to
 
@@ -1472,7 +1784,8 @@ def fit(self, X, y, sample_weight=None):
         ----------
         X : array-like, shape (n_samples, n_features)
             Training vectors, where n_samples is the number of samples
-            and n_features is the number of features.
+            and n_features is the number of features. When using GCV,
+            will be cast to float64 if necessary.
 
         y : array-like, shape (n_samples,)
             Target values. Will be cast to X's dtype if necessary
diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py
index a9c2f98a48b90..1cd386ee5d618 100644
--- a/sklearn/linear_model/tests/test_ridge.py
+++ b/sklearn/linear_model/tests/test_ridge.py
@@ -9,7 +9,6 @@
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_array_almost_equal
-from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_greater
@@ -35,10 +34,12 @@
 from sklearn.linear_model.ridge import RidgeClassifierCV
 from sklearn.linear_model.ridge import _solve_cholesky
 from sklearn.linear_model.ridge import _solve_cholesky_kernel
+from sklearn.linear_model.ridge import _check_gcv_mode
+from sklearn.linear_model.ridge import _X_operator
 
 from sklearn.datasets import make_regression
 from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import KFold
+from sklearn.model_selection import KFold, GroupKFold, cross_val_predict
 
 from sklearn.utils import check_random_state, _IS_32BIT
 from sklearn.datasets import make_multilabel_classification
@@ -313,6 +314,213 @@ def test_ridge_individual_penalties():
     assert_raises(ValueError, ridge.fit, X, y)
 
 
+@pytest.mark.parametrize('n_col', [(), (1,), (3,)])
+def test_x_operator(n_col):
+    rng = np.random.RandomState(0)
+    X = rng.randn(11, 8)
+    X_m = rng.randn(8)
+    sqrt_sw = rng.randn(len(X))
+    Y = rng.randn(11, *n_col)
+    A = rng.randn(9, *n_col)
+    operator = _X_operator(sp.csr_matrix(X), X_m, sqrt_sw)
+    reference_operator = np.hstack(
+        [X - sqrt_sw[:, None] * X_m, sqrt_sw[:, None]])
+    assert_allclose(reference_operator.dot(A), operator.dot(A))
+    assert_allclose(reference_operator.T.dot(Y), operator.T.dot(Y))
+
+
+@pytest.mark.parametrize('shape', [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)])
+@pytest.mark.parametrize('uniform_weights', [True, False])
+def test_compute_gram(shape, uniform_weights):
+    rng = np.random.RandomState(0)
+    X = rng.randn(*shape)
+    if uniform_weights:
+        sw = np.ones(X.shape[0])
+    else:
+        sw = rng.chisquare(1, shape[0])
+    sqrt_sw = np.sqrt(sw)
+    X_mean = np.average(X, axis=0, weights=sw)
+    X_centered = (X - X_mean) * sqrt_sw[:, None]
+    true_gram = X_centered.dot(X_centered.T)
+    X_sparse = sp.csr_matrix(X * sqrt_sw[:, None])
+    gcv = _RidgeGCV(fit_intercept=True)
+    computed_gram, computed_mean = gcv._compute_gram(X_sparse, sqrt_sw)
+    assert_allclose(X_mean, computed_mean)
+    assert_allclose(true_gram, computed_gram)
+
+
+@pytest.mark.parametrize('shape', [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)])
+@pytest.mark.parametrize('uniform_weights', [True, False])
+def test_compute_covariance(shape, uniform_weights):
+    rng = np.random.RandomState(0)
+    X = rng.randn(*shape)
+    if uniform_weights:
+        sw = np.ones(X.shape[0])
+    else:
+        sw = rng.chisquare(1, shape[0])
+    sqrt_sw = np.sqrt(sw)
+    X_mean = np.average(X, axis=0, weights=sw)
+    X_centered = (X - X_mean) * sqrt_sw[:, None]
+    true_covariance = X_centered.T.dot(X_centered)
+    X_sparse = sp.csr_matrix(X * sqrt_sw[:, None])
+    gcv = _RidgeGCV(fit_intercept=True)
+    computed_cov, computed_mean = gcv._compute_covariance(X_sparse, sqrt_sw)
+    assert_allclose(X_mean, computed_mean)
+    assert_allclose(true_covariance, computed_cov)
+
+
+def _make_sparse_offset_regression(
+        n_samples=100, n_features=100, proportion_nonzero=.5,
+        n_informative=10, n_targets=1, bias=13., X_offset=30.,
+        noise=30., shuffle=True, coef=False, random_state=None):
+    X, y, c = make_regression(
+        n_samples=n_samples, n_features=n_features,
+        n_informative=n_informative, n_targets=n_targets, bias=bias,
+        noise=noise, shuffle=shuffle,
+        coef=True, random_state=random_state)
+    if n_features == 1:
+        c = np.asarray([c])
+    X += X_offset
+    mask = np.random.RandomState(random_state).binomial(
+        1, proportion_nonzero, X.shape) > 0
+    removed_X = X.copy()
+    X[~mask] = 0.
+    removed_X[mask] = 0.
+    y -= removed_X.dot(c)
+    if n_features == 1:
+        c = c[0]
+    if coef:
+        return X, y, c
+    return X, y
+
+
+@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen'])
+@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix])
+@pytest.mark.parametrize('X_shape', [(11, 8), (11, 20)])
+@pytest.mark.parametrize('fit_intercept', [True, False])
+@pytest.mark.parametrize(
+    'y_shape, normalize, noise',
+    [
+        ((11,), True, 1.),
+        ((11, 1), False, 30.),
+        ((11, 3), False, 150.),
+    ]
+)
+def test_ridge_gcv_vs_ridge_loo_cv(
+        gcv_mode, X_constructor, X_shape, y_shape,
+        fit_intercept, normalize, noise):
+    n_samples, n_features = X_shape
+    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
+    X, y = _make_sparse_offset_regression(
+        n_samples=n_samples, n_features=n_features, n_targets=n_targets,
+        random_state=0, shuffle=False, noise=noise, n_informative=5
+    )
+    y = y.reshape(y_shape)
+
+    alphas = [1e-3, .1, 1., 10., 1e3]
+    loo_ridge = RidgeCV(cv=n_samples, fit_intercept=fit_intercept,
+                        alphas=alphas, scoring='neg_mean_squared_error',
+                        normalize=normalize)
+    gcv_ridge = RidgeCV(gcv_mode=gcv_mode, fit_intercept=fit_intercept,
+                        alphas=alphas, normalize=normalize)
+
+    loo_ridge.fit(X, y)
+
+    X_gcv = X_constructor(X)
+    gcv_ridge.fit(X_gcv, y)
+
+    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
+    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
+    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
+
+
+@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen'])
+@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix])
+@pytest.mark.parametrize('n_features', [8, 20])
+@pytest.mark.parametrize('y_shape, fit_intercept, noise',
+                         [((11,), True, 1.),
+                          ((11, 1), True, 20.),
+                          ((11, 3), True, 150.),
+                          ((11, 3), False, 30.)])
+def test_ridge_gcv_sample_weights(
+        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
+    alphas = [1e-3, .1, 1., 10., 1e3]
+    rng = np.random.RandomState(0)
+    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
+    X, y = _make_sparse_offset_regression(
+        n_samples=11, n_features=n_features, n_targets=n_targets,
+        random_state=0, shuffle=False, noise=noise)
+    y = y.reshape(y_shape)
+
+    sample_weight = 3 * rng.randn(len(X))
+    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
+    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
+    sample_weight = sample_weight.astype(float)
+    X_tiled, y_tiled = X[indices], y[indices]
+
+    cv = GroupKFold(n_splits=X.shape[0])
+    splits = cv.split(X_tiled, y_tiled, groups=indices)
+    kfold = RidgeCV(
+        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
+        fit_intercept=fit_intercept)
+    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
+    # `iid` parameter will change from True to False in version 0.22 and will
+    # be removed in 0.24
+    with ignore_warnings(category=DeprecationWarning):
+        kfold.fit(X_tiled, y_tiled)
+
+    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
+    splits = cv.split(X_tiled, y_tiled, groups=indices)
+    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
+    kfold_errors = (y_tiled - predictions)**2
+    kfold_errors = [
+        np.sum(kfold_errors[indices == i], axis=0) for
+        i in np.arange(X.shape[0])]
+    kfold_errors = np.asarray(kfold_errors)
+
+    X_gcv = X_constructor(X)
+    gcv_ridge = RidgeCV(
+        alphas=alphas, store_cv_values=True,
+        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
+    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
+    if len(y_shape) == 2:
+        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
+    else:
+        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]
+
+    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
+    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
+    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
+    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
+
+
+@pytest.mark.parametrize('mode', [True, 1, 5, 'bad', 'gcv'])
+def test_check_gcv_mode_error(mode):
+    X, y = make_regression(n_samples=5, n_features=2)
+    gcv = RidgeCV(gcv_mode=mode)
+    with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"):
+        gcv.fit(X, y)
+    with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"):
+        _check_gcv_mode(X, mode)
+
+
+@pytest.mark.parametrize("sparse", [True, False])
+@pytest.mark.parametrize(
+    'mode, mode_n_greater_than_p, mode_p_greater_than_n',
+    [(None, 'svd', 'eigen'),
+     ('auto', 'svd', 'eigen'),
+     ('eigen', 'eigen', 'eigen'),
+     ('svd', 'svd', 'svd')]
+)
+def test_check_gcv_mode_choice(sparse, mode, mode_n_greater_than_p,
+                               mode_p_greater_than_n):
+    X, _ = make_regression(n_samples=5, n_features=2)
+    if sparse:
+        X = sp.csr_matrix(X)
+    assert _check_gcv_mode(X, mode) == mode_n_greater_than_p
+    assert _check_gcv_mode(X.T, mode) == mode_p_greater_than_n
+
+
 def _test_ridge_loo(filter_):
     # test that can work with both dense or sparse matrices
     n_samples = X_diabetes.shape[0]
@@ -320,46 +528,7 @@ def _test_ridge_loo(filter_):
     ret = []
 
     fit_intercept = filter_ == DENSE_FILTER
-    if fit_intercept:
-        X_diabetes_ = X_diabetes - X_diabetes.mean(0)
-    else:
-        X_diabetes_ = X_diabetes
     ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)
-    ridge = Ridge(alpha=1.0, fit_intercept=fit_intercept)
-
-    # because fit_intercept is applied
-
-    # generalized cross-validation (efficient leave-one-out)
-    decomp = ridge_gcv._pre_compute(X_diabetes_, y_diabetes, fit_intercept)
-    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
-    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)
-
-    # brute-force leave-one-out: remove one example at a time
-    errors2 = []
-    values2 = []
-    for i in range(n_samples):
-        sel = np.arange(n_samples) != i
-        X_new = X_diabetes_[sel]
-        y_new = y_diabetes[sel]
-        ridge.fit(X_new, y_new)
-        value = ridge.predict([X_diabetes_[i]])[0]
-        error = (y_diabetes[i] - value) ** 2
-        errors2.append(error)
-        values2.append(value)
-
-    # check that efficient and brute-force LOO give same results
-    assert_almost_equal(errors, errors2)
-    assert_almost_equal(values, values2)
-
-    # generalized cross-validation (efficient leave-one-out,
-    # SVD variation)
-    decomp = ridge_gcv._pre_compute_svd(X_diabetes_, y_diabetes, fit_intercept)
-    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
-    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)
-
-    # check that efficient and SVD efficient LOO give same results
-    assert_almost_equal(errors, errors3)
-    assert_almost_equal(values, values3)
 
     # check best alpha
     ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
@@ -371,25 +540,26 @@ def _test_ridge_loo(filter_):
     scoring = make_scorer(mean_squared_error, greater_is_better=False)
     ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
     f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
-    assert_equal(ridge_gcv2.alpha_, alpha_)
+    assert ridge_gcv2.alpha_ == pytest.approx(alpha_)
 
     # check that we get same best alpha with custom score_func
     func = lambda x, y: -mean_squared_error(x, y)
     scoring = make_scorer(func)
     ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
     f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
-    assert_equal(ridge_gcv3.alpha_, alpha_)
+    assert ridge_gcv3.alpha_ == pytest.approx(alpha_)
 
     # check that we get same best alpha with a scorer
     scorer = get_scorer('neg_mean_squared_error')
     ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
     ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
-    assert_equal(ridge_gcv4.alpha_, alpha_)
+    assert ridge_gcv4.alpha_ == pytest.approx(alpha_)
 
     # check that we get same best alpha with sample weights
-    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
-                  sample_weight=np.ones(n_samples))
-    assert_equal(ridge_gcv.alpha_, alpha_)
+    if filter_ == DENSE_FILTER:
+        ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
+                      sample_weight=np.ones(n_samples))
+        assert ridge_gcv.alpha_ == pytest.approx(alpha_)
 
     # simulate several responses
     Y = np.vstack((y_diabetes, y_diabetes)).T
@@ -399,8 +569,8 @@ def _test_ridge_loo(filter_):
 
     ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
     y_pred = ridge_gcv.predict(filter_(X_diabetes))
-    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
-                              Y_pred, decimal=5)
+    assert_allclose(np.vstack((y_pred, y_pred)).T,
+                    Y_pred, rtol=1e-5)
 
     return ret
 
@@ -409,7 +579,7 @@ def _test_ridge_cv_normalize(filter_):
     ridge_cv = RidgeCV(normalize=True, cv=3)
     ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes)
 
-    gs = GridSearchCV(Ridge(normalize=True), cv=3,
+    gs = GridSearchCV(Ridge(normalize=True, solver='sparse_cg'), cv=3,
                       param_grid={'alpha': ridge_cv.alphas})
     gs.fit(filter_(10. * X_diabetes), y_diabetes)
     assert_equal(gs.best_estimator_.alpha, ridge_cv.alpha_)
@@ -503,12 +673,6 @@ def test_dense_sparse(test_func):
         check_dense_sparse(test_func)
 
 
-def test_ridge_cv_sparse_svd():
-    X = sp.csr_matrix(X_diabetes)
-    ridge = RidgeCV(gcv_mode="svd")
-    assert_raises(TypeError, ridge.fit, X)
-
-
 def test_ridge_sparse_svd():
     X = sp.csc_matrix(rng.rand(100, 10))
     y = rng.rand(100)
@@ -622,6 +786,10 @@ def test_ridgecv_store_cv_values():
     r.fit(x, y)
     assert r.cv_values_.shape == (n_samples, n_targets, n_alphas)
 
+    r = RidgeCV(cv=3, store_cv_values=True)
+    assert_raises_regex(ValueError, 'cv!=None and store_cv_values',
+                        r.fit, x, y)
+
 
 @pytest.mark.filterwarnings('ignore: The default value of cv')  # 0.22
 def test_ridge_classifier_cv_store_cv_values():
@@ -764,13 +932,13 @@ def test_ridgecv_negative_alphas():
     # Negative integers
     ridge = RidgeCV(alphas=(-1, -10, -100))
     assert_raises_regex(ValueError,
-                        "alphas cannot be negative.",
+                        "alphas must be positive",
                         ridge.fit, X, y)
 
     # Negative floats
     ridge = RidgeCV(alphas=(-0.1, -1.0, -10.0))
    assert_raises_regex(ValueError,
-                        "alphas cannot be negative.",
+                        "alphas must be positive",
                        ridge.fit, X, y)
 
 
@@ -889,48 +1057,6 @@ def test_ridge_regression_check_arguments_validity(return_intercept,
     assert_allclose(out, true_coefs, rtol=0, atol=atol)
 
 
-def test_errors_and_values_helper():
-    ridgecv = _RidgeGCV()
-    rng = check_random_state(42)
-    alpha = 1.
-    n = 5
-    y = rng.randn(n)
-    v = rng.randn(n)
-    Q = rng.randn(len(v), len(v))
-    QT_y = Q.T.dot(y)
-    G_diag, c = ridgecv._errors_and_values_helper(alpha, y, v, Q, QT_y)
-
-    # test that helper function behaves as expected
-    out, c_ = ridgecv._errors(alpha, y, v, Q, QT_y)
-    np.testing.assert_array_equal(out, (c / G_diag) ** 2)
-    np.testing.assert_array_equal(c, c)
-
-    out, c_ = ridgecv._values(alpha, y, v, Q, QT_y)
-    np.testing.assert_array_equal(out, y - (c / G_diag))
-    np.testing.assert_array_equal(c_, c)
-
-
-def test_errors_and_values_svd_helper():
-    ridgecv = _RidgeGCV()
-    rng = check_random_state(42)
-    alpha = 1.
-    for n, p in zip((5, 10), (12, 6)):
-        y = rng.randn(n)
-        v = rng.randn(p)
-        U = rng.randn(n, p)
-        UT_y = U.T.dot(y)
-        G_diag, c = ridgecv._errors_and_values_svd_helper(alpha, y, v, U, UT_y)
-
-        # test that helper function behaves as expected
-        out, c_ = ridgecv._errors_svd(alpha, y, v, U, UT_y)
-        np.testing.assert_array_equal(out, (c / G_diag) ** 2)
-        np.testing.assert_array_equal(c, c)
-
-        out, c_ = ridgecv._values_svd(alpha, y, v, U, UT_y)
-        np.testing.assert_array_equal(out, y - (c / G_diag))
-        np.testing.assert_array_equal(c_, c)
-
-
 def test_ridge_classifier_no_support_multilabel():
     X, y = make_multilabel_classification(n_samples=10, random_state=0)
     assert_raises(ValueError, RidgeClassifier().fit, X, y)
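
Closing editorial note (not part of the patch): a quick way to exercise the behaviour the changelog entry describes is to fit RidgeCV with fit_intercept=True on the same uncentered data passed once dense and once sparse, and check that the chosen alpha, coefficients and intercept agree. The tolerances below mirror the rtol=1e-3 used in the new tests; small numerical differences between the dense SVD path and the sparse covariance path are expected.

    # Illustrative sketch only -- assumes the patched RidgeCV from this diff.
    import numpy as np
    from scipy import sparse
    from sklearn.linear_model import RidgeCV

    rng = np.random.RandomState(0)
    X = rng.randn(30, 6) + 10.            # offset so the intercept matters
    y = X.dot(rng.randn(6)) + 5. + rng.randn(30)

    alphas = [0.1, 1.0, 10.0]
    dense = RidgeCV(alphas=alphas, fit_intercept=True).fit(X, y)
    with_sparse = RidgeCV(alphas=alphas, fit_intercept=True).fit(
        sparse.csr_matrix(X), y)

    assert dense.alpha_ == with_sparse.alpha_
    np.testing.assert_allclose(dense.coef_, with_sparse.coef_, rtol=1e-3)
    np.testing.assert_allclose(dense.intercept_, with_sparse.intercept_,
                               rtol=1e-3)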