[MRG] FIX sample_weight invariance for linear models #19616

Merged: 14 commits on Jun 12, 2021
doc/whats_new/v1.0.rst: 5 additions & 1 deletion
@@ -334,7 +334,7 @@ Changelog
BayesianRidge, ARDRegression were deprecated in:
:pr:`17746` by :user:`Maria Telenczuk <maikia>`.

- |Fix|: `sample_weight` are now fully taken into account in linear models
- |Fix| `sample_weight` are now fully taken into account in linear models
when `normalize=True` for both feature centering and feature
scaling.
:pr:`19426` by :user:`Alexandre Gramfort <agramfort>` and
@@ -348,6 +348,10 @@ Changelog
:class:`Lars`, :class:`LassoLars`, :class:`LarsCV` and
:class:`LassoLarsCV`. :pr:`20155` by :user:`Takeshi Oura <takoika>`.

- |Fix| Sample weight invariance for :class:`Ridge` was fixed in :pr:`19616` by
  :user:`Olivier Grisel <ogrisel>` and
  :user:`Christian Lorentzen <lorentzenchr>`.

:mod:`sklearn.manifold`
.......................

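(Not part of the diff) A minimal sketch of the invariance the entry above refers to, using made-up data and an illustrative alpha: integer sample weights should act exactly like repeating the corresponding rows.

```python
import numpy as np
from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=20, n_features=10, random_state=0)
sw = np.full(X.shape[0], 2.0)  # weight every sample twice

# Fitting with weight 2 on each row ...
ridge_weighted = Ridge(alpha=1.0, solver="cholesky").fit(X, y, sample_weight=sw)
# ... should give the same model as fitting on the dataset repeated twice.
ridge_repeated = Ridge(alpha=1.0, solver="cholesky").fit(
    np.vstack([X, X]), np.concatenate([y, y])
)

assert_allclose(ridge_weighted.coef_, ridge_repeated.coef_)
assert_allclose(ridge_weighted.intercept_, ridge_repeated.intercept_)
```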
sklearn/linear_model/_base.py: 4 additions & 1 deletion
@@ -272,7 +272,10 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
# the np.sqrt. Otherwise constant features cannot be detected with
# sample weights.
constant_mask = _is_constant_feature(X_var, X_offset, X.shape[0])
X_var *= X.shape[0]
if sample_weight is None:
X_var *= X.shape[0]
else:
X_var *= sample_weight.sum()
X_scale = np.sqrt(X_var, out=X_var)
X_scale[constant_mask] = 1.
if sp.issparse(X):
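(Not part of the diff) A standalone numpy sketch of the quantity the modified lines compute, with assumed variable names: under sample weights the per-feature scale becomes the weighted L2 norm of each centered column, sqrt(weighted_var * sample_weight.sum()), which reduces to the previous sqrt(weighted_var * n_samples) only when all weights equal one.

```python
import numpy as np
from numpy.testing import assert_allclose

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
sample_weight = rng.uniform(0.1, 2.0, size=X.shape[0])

X_offset = np.average(X, axis=0, weights=sample_weight)  # weighted column means
X_var = np.average((X - X_offset) ** 2, axis=0, weights=sample_weight)

# Scale as computed after this change ...
X_scale = np.sqrt(X_var * sample_weight.sum())
# ... equals the weighted norm of each centered column.
weighted_norm = np.sqrt(
    np.sum(sample_weight[:, None] * (X - X_offset) ** 2, axis=0)
)
assert_allclose(X_scale, weighted_norm)
```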
sklearn/linear_model/tests/test_base.py: 11 additions & 4 deletions
@@ -467,7 +467,9 @@ def test_preprocess_data_weighted(is_sparse):
axis=0)
constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
assert_array_equal(constant_mask, [0, 0, 1, 1])
expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples)
expected_X_scale = (
np.sqrt(X_sample_weight_var) * np.sqrt(sample_weight.sum())
)

# near constant features should not be scaled
expected_X_scale[constant_mask] = 1
@@ -510,14 +512,16 @@ def test_preprocess_data_weighted(is_sparse):
# _preprocess_data with normalize=True scales the data by the feature-wise
# euclidean norms while StandardScaler scales the data by the feature-wise
# standard deviations.
# The two are equivalent up to a ratio of np.sqrt(n_samples).
# The two are equivalent up to a ratio of np.sqrt(n_samples) if unweighted
# or np.sqrt(sample_weight.sum()) if weighted.
if is_sparse:
scaler = StandardScaler(with_mean=False).fit(
X, sample_weight=sample_weight)

# Non-constant features are scaled similarly with np.sqrt(n_samples)
assert_array_almost_equal(
scaler.transform(X).toarray()[:, :2] / np.sqrt(n_samples),
scaler.transform(X).toarray()[:, :2]
/ np.sqrt(sample_weight.sum()),
Xt.toarray()[:, :2]
)

@@ -530,7 +534,10 @@
scaler = StandardScaler(with_mean=True).fit(
X, sample_weight=sample_weight)
assert_array_almost_equal(scaler.mean_, X_mean)
assert_array_almost_equal(scaler.transform(X) / np.sqrt(n_samples), Xt)
assert_array_almost_equal(
scaler.transform(X) / np.sqrt(sample_weight.sum()),
Xt,
)
assert_array_almost_equal(yt, y - expected_y_mean)


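(Not part of the diff) A compact sketch of the ratio the updated assertions rely on, with simple synthetic data rather than the test's fixtures: StandardScaler fitted with sample_weight divides by the weighted standard deviation, while normalize=True preprocessing divides by the weighted column norm, so the two outputs differ by a factor of sqrt(sample_weight.sum()).

```python
import numpy as np
from numpy.testing import assert_allclose
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.rand(40, 2)
sw = rng.uniform(0.1, 2.0, size=X.shape[0])

scaler = StandardScaler(with_mean=True).fit(X, sample_weight=sw)

mean_w = np.average(X, axis=0, weights=sw)
var_w = np.average((X - mean_w) ** 2, axis=0, weights=sw)
Xt = (X - mean_w) / np.sqrt(var_w * sw.sum())  # normalize=True style output

assert_allclose(scaler.transform(X) / np.sqrt(sw.sum()), Xt)
```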
sklearn/linear_model/tests/test_coordinate_descent.py: 76 additions & 30 deletions
@@ -9,6 +9,7 @@
import joblib

from sklearn.base import is_classifier
from sklearn.base import clone
from sklearn.datasets import load_diabetes
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
@@ -453,7 +454,7 @@ def test_linear_model_sample_weights_normalize_in_pipeline(
X_train = sparse.csr_matrix(X_train)
X_test = _convert_container(X_train, 'sparse')

sample_weight = rng.rand(X_train.shape[0])
sample_weight = rng.uniform(low=0.1, high=100, size=X_train.shape[0])

# linear estimator with built-in feature normalization
reg_with_normalize = estimator(normalize=True, fit_intercept=True,
@@ -462,7 +463,12 @@

# linear estimator in a pipeline with a StandardScaler, normalize=False
linear_regressor = estimator(normalize=False, fit_intercept=True, **params)
_scale_alpha_inplace(linear_regressor, X_train.shape[0]) # rescale alpha

# rescale alpha
if model_name in ["Lasso", "ElasticNet"]:
_scale_alpha_inplace(linear_regressor, y_test.shape[0])
else:
_scale_alpha_inplace(linear_regressor, sample_weight.sum())
reg_with_scaler = Pipeline([
("scaler", StandardScaler(with_mean=with_mean)),
("linear_regressor", linear_regressor)
@@ -479,7 +485,8 @@
# sense that they predict exactly the same outcome.
y_pred_normalize = reg_with_normalize.predict(X_test)
y_pred_scaler = reg_with_scaler.predict(X_test)
assert_allclose(y_pred_normalize, y_pred_scaler)
assert_allclose(y_pred_normalize, y_pred_scaler)

# Check intercept computation when normalize is True
y_train_mean = np.average(y_train, weights=sample_weight)
if is_sparse:
@@ -1446,39 +1453,78 @@ def test_enet_ridge_consistency(normalize, ridge_alpha):
# effective_rank are more problematic in particular.

rng = np.random.RandomState(42)
n_samples = 300
X, y = make_regression(
n_samples=100,
n_features=300,
effective_rank=100,
n_samples=n_samples,
n_features=100,
effective_rank=10,
n_informative=50,
random_state=rng,
)
sw = rng.uniform(low=0.01, high=2, size=X.shape[0])

ridge = Ridge(
alpha=ridge_alpha,
sw = rng.uniform(low=0.01, high=10, size=X.shape[0])
alpha = 1.
common_params = dict(
normalize=normalize,
).fit(X, y, sample_weight=sw)

enet = ElasticNet(
alpha=ridge_alpha / sw.sum(),
normalize=normalize,
l1_ratio=0.,
max_iter=1000,
tol=1e-12,
)
# Even when the ElasticNet model has actually converged, the duality gap
# convergence criterion is never met when l1_ratio is 0 and for any value
# of the `tol` parameter. The convergence message should point the user to
# Ridge instead:
expected_msg = (
r"Objective did not converge\. .* "
r"Linear regression models with null weight for the "
r"l1 regularization term are more efficiently fitted "
r"using one of the solvers implemented in "
r"sklearn\.linear_model\.Ridge/RidgeCV instead\."
ridge = Ridge(alpha=alpha, **common_params).fit(
X, y, sample_weight=sw
)
if normalize:
alpha_enet = alpha / n_samples
else:
alpha_enet = alpha / sw.sum()
enet = ElasticNet(alpha=alpha_enet, l1_ratio=0, **common_params).fit(
X, y, sample_weight=sw
)
with pytest.warns(ConvergenceWarning, match=expected_msg):
enet.fit(X, y, sample_weight=sw)

assert_allclose(ridge.coef_, enet.coef_)
assert_allclose(ridge.intercept_, enet.intercept_)


@pytest.mark.parametrize(
"estimator", [
Lasso(alpha=1.),
ElasticNet(alpha=1., l1_ratio=0.1),
]
)
def test_sample_weight_invariance(estimator):
rng = np.random.RandomState(42)
X, y = make_regression(
n_samples=100,
n_features=300,
effective_rank=10,
n_informative=50,
random_state=rng,
)
normalize = False # These tests don't work for normalize=True.
sw = rng.uniform(low=0.01, high=2, size=X.shape[0])
params = dict(normalize=normalize, tol=1e-12)

# Check that setting some weights to 0 is equivalent to trimming the
# samples:
cutoff = X.shape[0] // 3
sw_with_null = sw.copy()
sw_with_null[:cutoff] = 0.
X_trimmed, y_trimmed = X[cutoff:, :], y[cutoff:]
sw_trimmed = sw[cutoff:]

reg_trimmed = clone(estimator).set_params(**params).fit(
X_trimmed, y_trimmed, sample_weight=sw_trimmed)
reg_null_weighted = clone(estimator).set_params(**params).fit(
X, y, sample_weight=sw_with_null)
assert_allclose(reg_null_weighted.coef_, reg_trimmed.coef_)
assert_allclose(reg_null_weighted.intercept_, reg_trimmed.intercept_)

# Check that duplicating the training dataset is equivalent to multiplying
# the weights by 2:
X_dup = np.concatenate([X, X], axis=0)
y_dup = np.concatenate([y, y], axis=0)
sw_dup = np.concatenate([sw, sw], axis=0)

reg_2sw = clone(estimator).set_params(**params).fit(
X, y, sample_weight=2 * sw)
reg_dup = clone(estimator).set_params(**params).fit(
X_dup, y_dup, sample_weight=sw_dup)

assert_allclose(reg_2sw.coef_, reg_dup.coef_)
assert_allclose(reg_2sw.intercept_, reg_dup.intercept_)
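(Not part of the diff) For reference, the alpha rescaling used above follows from the two estimators' documented objectives: Ridge minimizes ||y - Xw||^2 + alpha * ||w||^2, while ElasticNet with l1_ratio=0 minimizes ||y - Xw||^2 / (2 * n_samples) + 0.5 * alpha * ||w||^2, with n_samples playing the role of sample_weight.sum() once weights are fully accounted for. Equating the two objectives gives the alpha_enet = alpha_ridge / sw.sum() rescaling used in test_enet_ridge_consistency (the normalize=True branch uses n_samples instead). A minimal unweighted sanity check, with illustrative data and tolerances:

```python
import warnings

from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNet, Ridge

X, y = make_regression(n_samples=50, n_features=10, noise=1.0, random_state=0)
alpha_ridge = 1.0

ridge = Ridge(alpha=alpha_ridge).fit(X, y)

# The duality-gap stopping criterion is never met when l1_ratio=0 (see the
# comment removed above), so silence the expected ConvergenceWarning.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", ConvergenceWarning)
    enet = ElasticNet(
        alpha=alpha_ridge / X.shape[0], l1_ratio=0.0, max_iter=10_000, tol=1e-12
    ).fit(X, y)

assert_allclose(ridge.coef_, enet.coef_, rtol=1e-5)
assert_allclose(ridge.intercept_, enet.intercept_, atol=1e-6)
```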
sklearn/linear_model/tests/test_ridge.py: 51 additions & 0 deletions
@@ -11,6 +11,7 @@
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.utils.estimator_checks import check_sample_weights_invariance

from sklearn.exceptions import ConvergenceWarning

@@ -1414,3 +1415,53 @@ def test_ridge_sag_with_X_fortran():
X = X[::2, :]
y = y[::2]
Ridge(solver='sag').fit(X, y)


# FIXME: 'normalize' to be removed in 1.2
@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
"solver",
["cholesky", "lsqr", "sparse_cg", "svd", "sag", "saga"]
)
def test_ridge_sample_weight_invariance(normalize, solver):
"""Test that Ridge fulfils sample weight invariance.

Note that this test is stricter than the common test
check_sample_weights_invariance alone.
"""
params = dict(
alpha=1.,
normalize=normalize,
solver=solver,
tol=1e-12,
)
reg = Ridge(**params)
name = reg.__class__.__name__
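# kind="ones": fitting with unit sample weights must match fitting with
# sample_weight=None; kind="zeros": samples given zero weight must behave
# as if they were removed from the training set.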
check_sample_weights_invariance(name, reg, kind="ones")
check_sample_weights_invariance(name, reg, kind="zeros")

# Check that duplicating the training dataset is equivalent to multiplying
# the weights by 2:
if solver.startswith("sag") and normalize:
pytest.xfail("sag/saga diverge on the second part of this test")

rng = np.random.RandomState(42)
X, y = make_regression(
n_samples=100,
n_features=300,
effective_rank=10,
n_informative=50,
random_state=rng,
)
sw = rng.uniform(low=0.01, high=2, size=X.shape[0])
X_dup = np.concatenate([X, X], axis=0)
y_dup = np.concatenate([y, y], axis=0)
sw_dup = np.concatenate([sw, sw], axis=0)

ridge_2sw = Ridge(**params).fit(X, y, sample_weight=2 * sw)
ridge_dup = Ridge(**params).fit(
X_dup, y_dup, sample_weight=sw_dup)

assert_allclose(ridge_2sw.coef_, ridge_dup.coef_)
assert_allclose(ridge_2sw.intercept_, ridge_dup.intercept_)