[MRG] FIX sample_weight invariance for linear models #19616

Merged: 14 commits on Jun 12, 2021
doc/whats_new/v1.0.rst: 5 additions & 1 deletion
@@ -334,7 +334,7 @@ Changelog
BayesianRidge, ARDRegression were deprecated in:
:pr:`17746` by :user:`Maria Telenczuk <maikia>`.

- |Fix|: `sample_weight` are now fully taken into account in linear models
- |Fix| `sample_weight` are now fully taken into account in linear models
when `normalize=True` for both feature centering and feature
scaling.
:pr:`19426` by :user:`Alexandre Gramfort <agramfort>` and
@@ -348,6 +348,10 @@ Changelog
:class:`Lars`, :class:`LassoLars`, :class:`LarsCV` and
:class:`LassoLarsCV`. :pr:`20155` by :user:`Takeshi Oura <takoika>`.

- |Fix| Sample weight invariance for :class:`Ridge` was fixed in :pr:`19616` by
  :user:`Olivier Grisel <ogrisel>` and
  :user:`Christian Lorentzen <lorentzenchr>`.

:mod:`sklearn.manifold`
.......................

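(Not part of the diff) A minimal sketch of the invariance the entry above refers to, using made-up data and an illustrative alpha: integer sample weights should act exactly like repeating the corresponding rows.

```python
import numpy as np
from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=20, n_features=10, random_state=0)
sw = np.full(X.shape[0], 2.0)  # weight every sample twice

# Fitting with weight 2 on each row ...
ridge_weighted = Ridge(alpha=1.0, solver="cholesky").fit(X, y, sample_weight=sw)
# ... should give the same model as fitting on the dataset repeated twice.
ridge_repeated = Ridge(alpha=1.0, solver="cholesky").fit(
    np.vstack([X, X]), np.concatenate([y, y])
)

assert_allclose(ridge_weighted.coef_, ridge_repeated.coef_)
assert_allclose(ridge_weighted.intercept_, ridge_repeated.intercept_)
```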
sklearn/linear_model/_base.py: 4 additions & 1 deletion
@@ -272,7 +272,10 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
# the np.sqrt. Otherwise constant features cannot be detected with
# sample weights.
constant_mask = _is_constant_feature(X_var, X_offset, X.shape[0])
X_var *= X.shape[0]
if sample_weight is None:
X_var *= X.shape[0]
else:
X_var *= sample_weight.sum()
X_scale = np.sqrt(X_var, out=X_var)
X_scale[constant_mask] = 1.
if sp.issparse(X):
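(Not part of the diff) A standalone numpy sketch of the quantity the modified lines compute, with assumed variable names: under sample weights the per-feature scale becomes the weighted L2 norm of each centered column, sqrt(weighted_var * sample_weight.sum()), which reduces to the previous sqrt(weighted_var * n_samples) only when all weights equal one.

```python
import numpy as np
from numpy.testing import assert_allclose

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
sample_weight = rng.uniform(0.1, 2.0, size=X.shape[0])

X_offset = np.average(X, axis=0, weights=sample_weight)  # weighted column means
X_var = np.average((X - X_offset) ** 2, axis=0, weights=sample_weight)

# Scale as computed after this change ...
X_scale = np.sqrt(X_var * sample_weight.sum())
# ... equals the weighted norm of each centered column.
weighted_norm = np.sqrt(
    np.sum(sample_weight[:, None] * (X - X_offset) ** 2, axis=0)
)
assert_allclose(X_scale, weighted_norm)
```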
sklearn/linear_model/tests/test_base.py: 11 additions & 4 deletions
@@ -467,7 +467,9 @@ def test_preprocess_data_weighted(is_sparse):
axis=0)
constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
assert_array_equal(constant_mask, [0, 0, 1, 1])
expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples)
expected_X_scale = (
np.sqrt(X_sample_weight_var) * np.sqrt(sample_weight.sum())
)

# near constant features should not be scaled
expected_X_scale[constant_mask] = 1
@@ -510,14 +512,16 @@ def test_preprocess_data_weighted(is_sparse):
# _preprocess_data with normalize=True scales the data by the feature-wise
# euclidean norms while StandardScaler scales the data by the feature-wise
# standard deviations.
# The two are equivalent up to a ratio of np.sqrt(n_samples).
# The two are equivalent up to a ratio of np.sqrt(n_samples) if unweighted
# or np.sqrt(sample_weight.sum()) if weighted.
if is_sparse:
scaler = StandardScaler(with_mean=False).fit(
X, sample_weight=sample_weight)

# Non-constant features are scaled similarly with np.sqrt(n_samples)
assert_array_almost_equal(
scaler.transform(X).toarray()[:, :2] / np.sqrt(n_samples),
scaler.transform(X).toarray()[:, :2]
/ np.sqrt(sample_weight.sum()),
Xt.toarray()[:, :2]
)

@@ -530,7 +534,10 @@
scaler = StandardScaler(with_mean=True).fit(
X, sample_weight=sample_weight)
assert_array_almost_equal(scaler.mean_, X_mean)
assert_array_almost_equal(scaler.transform(X) / np.sqrt(n_samples), Xt)
assert_array_almost_equal(
scaler.transform(X) / np.sqrt(sample_weight.sum()),
Xt,
)
assert_array_almost_equal(yt, y - expected_y_mean)


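(Not part of the diff) A compact sketch of the ratio the updated assertions rely on, with simple synthetic data rather than the test's fixtures: StandardScaler fitted with sample_weight divides by the weighted standard deviation, while normalize=True preprocessing divides by the weighted column norm, so the two outputs differ by a factor of sqrt(sample_weight.sum()).

```python
import numpy as np
from numpy.testing import assert_allclose
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.rand(40, 2)
sw = rng.uniform(0.1, 2.0, size=X.shape[0])

scaler = StandardScaler(with_mean=True).fit(X, sample_weight=sw)

mean_w = np.average(X, axis=0, weights=sw)
var_w = np.average((X - mean_w) ** 2, axis=0, weights=sw)
Xt = (X - mean_w) / np.sqrt(var_w * sw.sum())  # normalize=True style output

assert_allclose(scaler.transform(X) / np.sqrt(sw.sum()), Xt)
```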
sklearn/linear_model/tests/test_coordinate_descent.py: 76 additions & 30 deletions
@@ -9,6 +9,7 @@
import joblib

from sklearn.base import is_classifier
from sklearn.base import clone
from sklearn.datasets import load_diabetes
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
@@ -453,7 +454,7 @@ def test_linear_model_sample_weights_normalize_in_pipeline(
X_train = sparse.csr_matrix(X_train)
X_test = _convert_container(X_train, 'sparse')

sample_weight = rng.rand(X_train.shape[0])
sample_weight = rng.uniform(low=0.1, high=100, size=X_train.shape[0])

# linear estimator with built-in feature normalization
reg_with_normalize = estimator(normalize=True, fit_intercept=True,
@@ -462,7 +463,12 @@

# linear estimator in a pipeline with a StandardScaler, normalize=False
linear_regressor = estimator(normalize=False, fit_intercept=True, **params)
_scale_alpha_inplace(linear_regressor, X_train.shape[0]) # rescale alpha

# rescale alpha
if model_name in ["Lasso", "ElasticNet"]:
_scale_alpha_inplace(linear_regressor, y_test.shape[0])
else:
_scale_alpha_inplace(linear_regressor, sample_weight.sum())
reg_with_scaler = Pipeline([
("scaler", StandardScaler(with_mean=with_mean)),
("linear_regressor", linear_regressor)
@@ -479,7 +485,8 @@
# sense that they predict exactly the same outcome.
y_pred_normalize = reg_with_normalize.predict(X_test)
y_pred_scaler = reg_with_scaler.predict(X_test)
assert_allclose(y_pred_normalize, y_pred_scaler)
assert_allclose(y_pred_normalize, y_pred_scaler)

# Check intercept computation when normalize is True
y_train_mean = np.average(y_train, weights=sample_weight)
if is_sparse:
@@ -1446,39 +1453,78 @@ def test_enet_ridge_consistency(normalize, ridge_alpha):
# effective_rank are more problematic in particular.

rng = np.random.RandomState(42)
n_samples = 300
X, y = make_regression(
n_samples=100,
n_features=300,
effective_rank=100,
n_samples=n_samples,
n_features=100,
effective_rank=10,
n_informative=50,
random_state=rng,
)
sw = rng.uniform(low=0.01, high=2, size=X.shape[0])

ridge = Ridge(
alpha=ridge_alpha,
sw = rng.uniform(low=0.01, high=10, size=X.shape[0])
alpha = 1.
common_params = dict(
normalize=normalize,
).fit(X, y, sample_weight=sw)

enet = ElasticNet(
alpha=ridge_alpha / sw.sum(),
normalize=normalize,
l1_ratio=0.,
max_iter=1000,
tol=1e-12,
)
# Even when the ElasticNet model has actually converged, the duality gap
# convergence criterion is never met when l1_ratio is 0 and for any value
# of the `tol` parameter. The convergence message should point the user to
# Ridge instead:
expected_msg = (
r"Objective did not converge\. .* "
r"Linear regression models with null weight for the "
r"l1 regularization term are more efficiently fitted "
r"using one of the solvers implemented in "
r"sklearn\.linear_model\.Ridge/RidgeCV instead\."
ridge = Ridge(alpha=alpha, **common_params).fit(
X, y, sample_weight=sw
)
if normalize:
alpha_enet = alpha / n_samples
else:
alpha_enet = alpha / sw.sum()
enet = ElasticNet(alpha=alpha_enet, l1_ratio=0, **common_params).fit(
X, y, sample_weight=sw
)
with pytest.warns(ConvergenceWarning, match=expected_msg):
enet.fit(X, y, sample_weight=sw)

assert_allclose(ridge.coef_, enet.coef_)
assert_allclose(ridge.intercept_, enet.intercept_)


@pytest.mark.parametrize(
"estimator", [
Lasso(alpha=1.),
ElasticNet(alpha=1., l1_ratio=0.1),
]
)
def test_sample_weight_invariance(estimator):
rng = np.random.RandomState(42)
X, y = make_regression(
n_samples=100,
n_features=300,
effective_rank=10,
n_informative=50,
random_state=rng,
)
normalize = False # These tests don't work for normalize=True.
sw = rng.uniform(low=0.01, high=2, size=X.shape[0])
params = dict(normalize=normalize, tol=1e-12)

# Check that setting some weights to 0 is equivalent to trimming the
# samples:
cutoff = X.shape[0] // 3
sw_with_null = sw.copy()
sw_with_null[:cutoff] = 0.
X_trimmed, y_trimmed = X[cutoff:, :], y[cutoff:]
sw_trimmed = sw[cutoff:]

reg_trimmed = clone(estimator).set_params(**params).fit(
X_trimmed, y_trimmed, sample_weight=sw_trimmed)
reg_null_weighted = clone(estimator).set_params(**params).fit(
X, y, sample_weight=sw_with_null)
assert_allclose(reg_null_weighted.coef_, reg_trimmed.coef_)
assert_allclose(reg_null_weighted.intercept_, reg_trimmed.intercept_)

# Check that duplicating the training dataset is equivalent to multiplying
# the weights by 2:
X_dup = np.concatenate([X, X], axis=0)
y_dup = np.concatenate([y, y], axis=0)
sw_dup = np.concatenate([sw, sw], axis=0)

reg_2sw = clone(estimator).set_params(**params).fit(
X, y, sample_weight=2 * sw)
reg_dup = clone(estimator).set_params(**params).fit(
X_dup, y_dup, sample_weight=sw_dup)

assert_allclose(reg_2sw.coef_, reg_dup.coef_)
assert_allclose(reg_2sw.intercept_, reg_dup.intercept_)
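(Not part of the diff) For reference, the alpha rescaling used above follows from the two estimators' documented objectives: Ridge minimizes ||y - Xw||^2 + alpha * ||w||^2, while ElasticNet with l1_ratio=0 minimizes ||y - Xw||^2 / (2 * n_samples) + 0.5 * alpha * ||w||^2, with n_samples playing the role of sample_weight.sum() once weights are fully accounted for. Equating the two objectives gives the alpha_enet = alpha_ridge / sw.sum() rescaling used in test_enet_ridge_consistency (the normalize=True branch uses n_samples instead). A minimal unweighted sanity check, with illustrative data and tolerances:

```python
import warnings

from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNet, Ridge

X, y = make_regression(n_samples=50, n_features=10, noise=1.0, random_state=0)
alpha_ridge = 1.0

ridge = Ridge(alpha=alpha_ridge).fit(X, y)

# The duality-gap stopping criterion is never met when l1_ratio=0 (see the
# comment removed above), so silence the expected ConvergenceWarning.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", ConvergenceWarning)
    enet = ElasticNet(
        alpha=alpha_ridge / X.shape[0], l1_ratio=0.0, max_iter=10_000, tol=1e-12
    ).fit(X, y)

assert_allclose(ridge.coef_, enet.coef_, rtol=1e-5)
assert_allclose(ridge.intercept_, enet.intercept_, atol=1e-6)
```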
sklearn/linear_model/tests/test_ridge.py: 51 additions & 0 deletions
@@ -11,6 +11,7 @@
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.utils.estimator_checks import check_sample_weights_invariance

from sklearn.exceptions import ConvergenceWarning

@@ -1414,3 +1415,53 @@ def test_ridge_sag_with_X_fortran():
X = X[::2, :]
y = y[::2]
Ridge(solver='sag').fit(X, y)


# FIXME: 'normalize' to be removed in 1.2
@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
"solver",
["cholesky", "lsqr", "sparse_cg", "svd", "sag", "saga"]
)
def test_ridge_sample_weight_invariance(normalize, solver):
"""Test that Ridge fulfils sample weight invariance.

Note that this test is stricter than the common test
check_sample_weights_invariance alone.
"""
params = dict(
alpha=1.,
normalize=normalize,
solver=solver,
tol=1e-12,
)
reg = Ridge(**params)
name = reg.__class__.__name__
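# kind="ones": fitting with unit sample weights must match fitting with
# sample_weight=None; kind="zeros": samples given zero weight must behave
# as if they were removed from the training set.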
check_sample_weights_invariance(name, reg, kind="ones")
check_sample_weights_invariance(name, reg, kind="zeros")

# Check that duplicating the training dataset is equivalent to multiplying
# the weights by 2:
if solver.startswith("sag") and normalize:
pytest.xfail("sag/saga diverge on the second part of this test")

rng = np.random.RandomState(42)
X, y = make_regression(
n_samples=100,
n_features=300,
effective_rank=10,
n_informative=50,
random_state=rng,
)
sw = rng.uniform(low=0.01, high=2, size=X.shape[0])
X_dup = np.concatenate([X, X], axis=0)
y_dup = np.concatenate([y, y], axis=0)
sw_dup = np.concatenate([sw, sw], axis=0)

ridge_2sw = Ridge(**params).fit(X, y, sample_weight=2 * sw)
ridge_dup = Ridge(**params).fit(
X_dup, y_dup, sample_weight=sw_dup)

assert_allclose(ridge_2sw.coef_, ridge_dup.coef_)
assert_allclose(ridge_2sw.intercept_, ridge_dup.intercept_)