Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion doc/whats_new/v0.22.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ random sampling procedures.
- :class:`impute.IterativeImputer` when `X` has features with no missing
values. |Feature|
- :class:`linear_model.Ridge` when `X` is sparse. |Fix|
- :class:`linear_model.RidgeCV` when `cv` is `None`. |Efficiency| |Fix|
- :class:`model_selection.StratifiedKFold` and any use of `cv=int` with a
classifier. |Fix|

Expand Down Expand Up @@ -504,6 +505,22 @@ Changelog
requires less memory.
:pr:`14108`, :pr:`14170`, :pr:`14296` by :user:`Alex Henrie <alexhenrie>`.

- |Efficiency| :class:`linear_model.RidgeCV` and
:class:`linear_model.RidgeClassifierCV` now does not allocate a
potentially large array to store dual coefficients for all hyperparameters
during its `fit`, nor an array to store all LOO predictions unless
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LOO predictions or mean squared errors

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually it depends of scoring: LOO predictions if scoring is not None otherwise mean squared errors.
But agreed that it should be added.

`store_cv_values` is `True`.
:pr:`15183` by :user:`Jérôme Dockès <jeromedockes>`.

- |Fix| :class:`linear_model.RidgeCV` returns predictions in the original space
when `scoring is not None`. In addition, the reported mean squared errors are
normalized by the sample weights for consistency.
:pr:`15183` by :user:`Jérôme Dockès <jeromedockes>`.

- |Fix| In :class:`linear_model.RidgeCV`, the predicitons reported by
`cv_values_` are now rescaled in the original space when `scoring` is not
`None`. :pr:`13995` by :user:`Jérôme Dockès <jeromedockes>`.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#13995 is an issue, not the PR


- |Fix| :class:`linear_model.Ridge` now correctly fits an intercept when `X` is
sparse, `solver="auto"` and `fit_intercept=True`, because the default solver
in this configuration has changed to `sparse_cg`, which can fit an intercept
Expand Down Expand Up @@ -807,7 +824,6 @@ Changelog
- |Fix| The liblinear solver now supports ``sample_weight``.
:pr:`15038` by :user:`Guillaume Lemaitre <glemaitre>`.


:mod:`sklearn.tree`
...................

Expand Down
66 changes: 39 additions & 27 deletions sklearn/linear_model/_ridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1054,6 +1054,16 @@ def _matmat(self, v):
return res


class _IdentityEstimator:
"""Hack to call a scorer when we already have the predictions."""

def decision_function(self, y_predict):
return y_predict

def predict(self, y_predict):
return y_predict


class _RidgeGCV(LinearModel):
"""Ridge regression with built-in Generalized Cross-Validation

Expand Down Expand Up @@ -1087,6 +1097,10 @@ class _RidgeGCV(LinearModel):

looe = y - loov = c / diag(G^-1)

The best score (negative mean squared error or user-provided scoring) is
stored in the `best_score_` attribute, and the selected hyperparameter in
`alpha_`.

References
----------
http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf
Expand Down Expand Up @@ -1462,43 +1476,41 @@ def fit(self, X, y, sample_weight=None):
else:
sqrt_sw = np.ones(X.shape[0], dtype=X.dtype)

X_mean, *decomposition = decompose(X, y, sqrt_sw)

scorer = check_scoring(self, scoring=self.scoring, allow_none=True)
error = scorer is None

n_y = 1 if len(y.shape) == 1 else y.shape[1]
cv_values = np.zeros((n_samples * n_y, len(self.alphas)),
dtype=X.dtype)
C = []
X_mean, *decomposition = decompose(X, y, sqrt_sw)

if self.store_cv_values:
self.cv_values_ = np.empty(
(n_samples * n_y, len(self.alphas)), dtype=X.dtype)

best_coef, best_score, best_alpha = None, None, None

for i, alpha in enumerate(self.alphas):
G_inverse_diag, c = solve(
float(alpha), y, sqrt_sw, X_mean, *decomposition)
if error:
squared_errors = (c / G_inverse_diag) ** 2
cv_values[:, i] = squared_errors.ravel()
alpha_score = -squared_errors.mean()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there's another bug: we need to devide sample_weight here, otherwise we get weighted error.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why shouldn't sample weights be taken into account when computing the error? Note that this has always been the behaviour of the GCV estimator and is made explicit here

or another form of cross-validation, because only generalized

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's embarassing that such important thing is documented in private functions.
I think users will expect that RidgeCV() should be equivalent to GridSearchCV(Ridge(), cv=LeaveOneOut())

if self.store_cv_values:
self.cv_values_[:, i] = squared_errors.ravel()
else:
predictions = y - (c / G_inverse_diag)
cv_values[:, i] = predictions.ravel()
C.append(c)

if error:
best = cv_values.mean(axis=0).argmin()
else:
# The scorer want an object that will make the predictions but
# they are already computed efficiently by _RidgeGCV. This
# identity_estimator will just return them
def identity_estimator():
pass
identity_estimator.decision_function = lambda y_predict: y_predict
identity_estimator.predict = lambda y_predict: y_predict

# signature of scorer is (estimator, X, y)
out = [scorer(identity_estimator, cv_values[:, i], y.ravel())
for i in range(len(self.alphas))]
best = np.argmax(out)

self.alpha_ = self.alphas[best]
self.dual_coef_ = C[best]
alpha_score = scorer(
_IdentityEstimator(), predictions.ravel(), y.ravel())
if self.store_cv_values:
self.cv_values_[:, i] = \
(predictions.ravel() / np.repeat(sqrt_sw, n_y)
+ y_offset)
if (best_score is None) or (alpha_score > best_score):
best_coef, best_score, best_alpha = c, alpha_score, alpha

self.alpha_ = best_alpha
self.best_score_ = best_score
self.dual_coef_ = best_coef
self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)

X_offset += X_mean * X_scale
Expand All @@ -1509,7 +1521,7 @@ def identity_estimator():
cv_values_shape = n_samples, len(self.alphas)
else:
cv_values_shape = n_samples, n_y, len(self.alphas)
self.cv_values_ = cv_values.reshape(cv_values_shape)
self.cv_values_ = self.cv_values_.reshape(cv_values_shape)

return self

Expand Down
79 changes: 79 additions & 0 deletions sklearn/linear_model/tests/test_ridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,85 @@ def test_ridge_gcv_sample_weights(
assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)


@pytest.mark.parametrize("fit_intercept", [True, False])
def test_ridge_gcv_stored_predictions(fit_intercept):
# check the predictions stored in `cv_values_` are equivalent to the
# prediction computed with a LOO with a Ridge at a specific alpha when the
# `scoring` is not `None`. We check as well that the predictions are
# rescaled if fitted with an intercept.
# Partly a regression test for #13998
X, y = make_regression(n_samples=6, n_features=2, random_state=42)
ridge_cv = RidgeCV(
fit_intercept=fit_intercept, scoring='neg_mean_squared_error',
store_cv_values=True, alphas=[1.]
)
ridge_cv.fit(X, y)
ridge = Ridge(fit_intercept=fit_intercept)
loo_pred = cross_val_predict(ridge, X, y, cv=len(X))
assert_allclose(loo_pred.ravel(), ridge_cv.cv_values_.ravel())


@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize("with_sample_weight", [False, True])
def test_ridge_gcv_equivalence_prediction_metric(fit_intercept,
with_sample_weight):
# Check the consistency between the cv_values_ obtained with and without
# a score. The default score being the mean squared error, we can compute
# the error from the prediction with `scoring='neg_mean_squared_error`
# the score reported when `scoring=None`
X, y = make_regression(n_samples=6, n_features=2, random_state=42)
sample_weight = np.arange(1, len(y) + 1) if with_sample_weight else None
from sklearn.utils.validation import _check_sample_weight
sample_weight = _check_sample_weight(sample_weight, X)
ridge_cv_no_score = RidgeCV(
fit_intercept=fit_intercept, scoring=None, store_cv_values=True,
alphas=[1.]
)
ridge_cv_score = RidgeCV(
fit_intercept=fit_intercept, scoring='neg_mean_squared_error',
store_cv_values=True, alphas=[1.]
)

ridge_cv_no_score.fit(X, y, sample_weight=sample_weight)
ridge_cv_score.fit(X, y, sample_weight=sample_weight)

# compute the mean squared error and apply the sample weight
mse = ((y - ridge_cv_score.cv_values_.ravel()) ** 2) * sample_weight
mse = mse.sum() / len(mse)
assert ridge_cv_no_score.cv_values_.mean() == pytest.approx(mse)


def test_ridge_gcv_cv_values_not_stored():
# Check that `cv_values_` is not stored when store_cv_values is False
X, y = make_regression(n_samples=6, n_features=2, random_state=42)
ridge_cv = RidgeCV(store_cv_values=False, alphas=[1.])
ridge_cv.fit(X, y)

assert not hasattr(ridge_cv, "cv_values_")


@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize("with_sample_weight", [False, True])
def test_ridge_gcv_decision_function_scoring(fit_intercept,
with_sample_weight):
# check that the passing a scorer computing the mean squared error is
# equivalent to `scoring=None`

def scorer(estimator, X, Y):
predict = estimator.decision_function(X)
return - np.mean((Y - predict) ** 2)

X, y = make_regression(n_samples=10, n_features=2, random_state=0)
sample_weight = np.arange(1, len(y) + 1) if with_sample_weight else None
ridge_1 = _RidgeGCV(
fit_intercept=fit_intercept, scoring=None, store_cv_values=True,
alphas=[1.0]).fit(X, y, sample_weight=sample_weight)
ridge_2 = _RidgeGCV(
fit_intercept=fit_intercept, scoring=scorer, store_cv_values=True,
alphas=[1.0]).fit(X, y, sample_weight=sample_weight)
assert ridge_1.best_score_ == pytest.approx(ridge_2.best_score_)


@pytest.mark.parametrize('mode', [True, 1, 5, 'bad', 'gcv'])
def test_check_gcv_mode_error(mode):
X, y = make_regression(n_samples=5, n_features=2)
Expand Down