Commit e3b9ef8

TST update test_linear_regression_sample_weight_consistency
1 parent 78c56e5

2 files changed: +61 -50 lines changed

sklearn/linear_model/tests/test_base.py

Lines changed: 60 additions & 49 deletions
@@ -753,82 +753,93 @@ def test_fused_types_make_dataset():
     assert_array_equal(yi_64, yicsr_64)


-@pytest.mark.parametrize(
-    "sparseX",
-    [
-        False,
-        pytest.param(
-            True,
-            marks=pytest.mark.xfail(
-                reason="Compare issue #15438: sparse with "
-                "sample weight gives wrong results."
-            ),
-        ),
-    ],
-)
-@pytest.mark.parametrize("fit_intercept", [False, True])
+# FIXME: 'normalize' to be removed in 1.2
+@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
 @pytest.mark.parametrize("normalize", [False, True])
-def test_linear_regression_sample_weight_consistentcy(
-    sparseX, fit_intercept, normalize
+@pytest.mark.parametrize("sparseX", [False, True])
+@pytest.mark.parametrize("fit_intercept", [False, True])
+@pytest.mark.parametrize("data", ["tall", "wide"])
+def test_linear_regression_sample_weight_consistency(
+    normalize, sparseX, fit_intercept, data
 ):
-    """Test that the impact of sample_weight is consistent."""
-    rng = np.random.RandomState(0)
-    n_samples, n_features = 10, 5
+    """Test that the impact of sample_weight is consistent.

-    X = rng.rand(n_samples, n_features)
-    y = rng.rand(n_samples)
+    Note that this test is stricter than the common test
+    check_sample_weights_invariance alone.
+    """
+    tol = 1e-4
+    if fit_intercept:
+        tol = 1e-1
+    n_samples = 100
+    if data == "tall":
+        n_features = n_samples // 2
+    else:
+        n_features = n_samples * 2
+    rng = np.random.RandomState(42)
+    X, y = make_regression(
+        n_samples=n_samples,
+        n_features=n_features,
+        effective_rank=None,
+        n_informative=50,
+        bias=int(fit_intercept),
+        random_state=rng,
+    )
     if sparseX:
         X = sparse.csr_matrix(X)
     params = dict(fit_intercept=fit_intercept)

-    reg = LinearRegression(**params).fit(X, y)
+    # 1) sample_weight=np.ones(..) should be equivalent to sample_weight=None
+    # same check as check_sample_weights_invariance(name, reg, kind="ones"), but we also
+    # test with sparse input.
+    reg = LinearRegression(**params).fit(X, y, sample_weight=None)
     coef = reg.coef_.copy()
     if fit_intercept:
         intercept = reg.intercept_
-
-    # sample_weight=np.ones(..) should be equivalent to sample_weight=None
-    # same check is done as check_sample_weights_invariance in
-    # sklearn.utils.estimator_checks.py, at least for dense input
     sample_weight = np.ones_like(y)
     reg.fit(X, y, sample_weight=sample_weight)
-    assert_allclose(reg.coef_, coef, rtol=1e-6)
+    assert_allclose(reg.coef_, coef, rtol=tol)
     if fit_intercept:
         assert_allclose(reg.intercept_, intercept)

-    # scaling of sample_weight should have no effect
-    # Note: For models with penalty, scaling the penalty term might work.
-    sample_weight = np.pi * np.ones_like(y)
+    # 2) setting elements of sample_weight to 0 is equivalent to removing these samples
+    # same check as check_sample_weights_invariance(name, reg, kind="zeros"), but we
+    # also test with sparse input
+    sample_weight = rng.uniform(low=0.01, high=2, size=X.shape[0])
+    sample_weight[-5:] = 0
+    y[-5:] *= 1000  # to make excluding those samples important
     reg.fit(X, y, sample_weight=sample_weight)
-    assert_allclose(reg.coef_, coef, rtol=1e-6)
+    coef = reg.coef_.copy()
     if fit_intercept:
-        assert_allclose(reg.intercept_, intercept)
-
-    # setting one element of sample_weight to 0 is equivalent to removing
-    # the corresponding sample, see PR #15015
-    sample_weight = np.ones_like(y)
-    sample_weight[-1] = 0
-    reg.fit(X, y, sample_weight=sample_weight)
-    coef1 = reg.coef_.copy()
+        intercept = reg.intercept_
+    reg.fit(X[:-5, :], y[:-5], sample_weight=sample_weight[:-5])
+    assert_allclose(reg.coef_, coef, rtol=tol)
     if fit_intercept:
-        intercept1 = reg.intercept_
-    reg.fit(X[:-1], y[:-1])
-    assert_allclose(reg.coef_, coef1, rtol=1e-6)
+        assert_allclose(reg.intercept_, intercept, atol=1e-14, rtol=1e-6)
+
+    # 3) scaling of sample_weight should have no effect
+    # Note: For models with penalty, scaling the penalty term might work.
+    reg2 = LinearRegression(**params)
+    reg2.fit(X, y, sample_weight=np.pi * sample_weight)
+    assert_allclose(reg2.coef_, coef, rtol=tol)
     if fit_intercept:
-        assert_allclose(reg.intercept_, intercept1)
+        assert_allclose(reg2.intercept_, intercept, atol=1e-14, rtol=1e-5)

-    # check that multiplying sample_weight by 2 is equivalent
+    # 4) check that multiplying sample_weight by 2 is equivalent
     # to repeating corresponding samples twice
     if sparseX:
         X = X.toarray()
     X2 = np.concatenate([X, X[: n_samples // 2]], axis=0)
     y2 = np.concatenate([y, y[: n_samples // 2]])
-    sample_weight_1 = np.ones_like(y)
-    sample_weight_1[: n_samples // 2] = 2
+    sample_weight_1 = sample_weight.copy()
+    sample_weight_1[: n_samples // 2] *= 2
+    sample_weight_2 = np.concatenate(
+        [sample_weight, sample_weight[: n_samples // 2]], axis=0
+    )
     if sparseX:
         X = sparse.csr_matrix(X)
         X2 = sparse.csr_matrix(X2)
     reg1 = LinearRegression(**params).fit(X, y, sample_weight=sample_weight_1)
-    reg2 = LinearRegression(**params).fit(X2, y2, sample_weight=None)
-    assert_allclose(reg1.coef_, reg2.coef_)
+    reg2 = LinearRegression(**params).fit(X2, y2, sample_weight=sample_weight_2)
+    assert_allclose(reg1.coef_, reg2.coef_, rtol=tol)
     if fit_intercept:
-        assert_allclose(reg1.intercept_, reg2.intercept_)
+        assert_allclose(reg1.intercept_, reg2.intercept_, atol=1e-14, rtol=1e-6)
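The invariances asserted above can also be reproduced standalone. Below is a minimal sketch of checks 2) and 4), where a zero weight behaves like dropping a sample and a doubled weight like repeating it. It is not part of this commit; the sample sizes and tolerance are illustrative, and only the public make_regression and LinearRegression APIs are assumed.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X, y = make_regression(n_samples=20, n_features=5, noise=10.0, random_state=0)

# Check 2): a zero sample_weight is equivalent to removing that sample.
w = np.ones(20)
w[-1] = 0.0
reg_zero = LinearRegression().fit(X, y, sample_weight=w)
reg_drop = LinearRegression().fit(X[:-1], y[:-1])
np.testing.assert_allclose(reg_zero.coef_, reg_drop.coef_, rtol=1e-6)

# Check 4): doubling a sample_weight is equivalent to repeating the sample.
w = np.ones(20)
w[0] = 2.0
reg_weighted = LinearRegression().fit(X, y, sample_weight=w)
X2 = np.vstack([X, X[:1]])
y2 = np.concatenate([y, y[:1]])
reg_repeated = LinearRegression().fit(X2, y2)
np.testing.assert_allclose(reg_weighted.coef_, reg_repeated.coef_, rtol=1e-6)

These are the same identities the parametrized test asserts, with looser tolerances for the harder fit_intercept and wide-data cases.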

sklearn/linear_model/tests/test_ridge.py

Lines changed: 1 addition & 1 deletion
@@ -1524,7 +1524,7 @@ def test_ridge_sample_weight_consistency(
 
     n_samples = 100
     if data == "tall":
-        n_features = n_samples // 30
+        n_features = n_samples // 2
     else:
         n_features = n_samples * 2
     rng = np.random.RandomState(42)
