Fix LinearRegression's numerical stability on rank deficient data by setting the cond parameter in the call to scipy.linalg.lstsq #30040

Merged
@@ -0,0 +1,6 @@
- :class:`linear_model.LinearRegression` now sets the `cond` parameter when
calling the `scipy.linalg.lstsq` solver on dense input data. This ensures
more numerically robust results on rank-deficient data. In particular, it
empirically fixes the expected equivalence property between fitting with
reweighted or with repeated data points.
:pr:`30040` by :user:`Antoine Baker <antoinebaker>`.
20 changes: 3 additions & 17 deletions sklearn/linear_model/_base.py
@@ -673,30 +673,16 @@ def rmatvec(b):
)
self.coef_ = np.vstack([out[0] for out in outs])
else:
self.coef_, _, self.rank_, self.singular_ = linalg.lstsq(X, y)
# cut-off ratio for small singular values
cond = max(X.shape) * np.finfo(X.dtype).eps
self.coef_, _, self.rank_, self.singular_ = linalg.lstsq(X, y, cond=cond)
self.coef_ = self.coef_.T

if y.ndim == 1:
self.coef_ = np.ravel(self.coef_)
self._set_intercept(X_offset, y_offset, X_scale)
return self

def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
# TODO: investigate failure see meta-issue #16298
#
# Note: this model should converge to the minimum norm solution of the
# least squares problem and as a result be numerically stable enough when
# running the equivalence check even if n_features > n_samples. Maybe
# this is not the case and a different choice of solver could fix
# this problem.
tags._xfail_checks = {
"check_sample_weight_equivalence": (
"sample_weight is not equivalent to removing/repeating samples."
),
}
return tags
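The `cond` cutoff introduced above follows a standard recipe for rank-revealing least squares: singular values smaller than `max(X.shape) * eps` relative to the largest one are treated as zero, so `lstsq` returns the minimum-norm solution on the effective rank. A minimal standalone sketch of the effect on rank-deficient input (illustrative code, not scikit-learn's internals):

```python
# Illustrative sketch: effect of the `cond` cutoff in scipy.linalg.lstsq
# on rank-deficient data (a duplicated column).
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
X = rng.rand(10, 3)
X = np.hstack([X, X[:, :1]])  # duplicate a column -> rank-deficient (rank 3)
y = rng.rand(10)

# cut-off ratio for small singular values, as set in this PR
cond = max(X.shape) * np.finfo(X.dtype).eps
coef, _, rank, singular = linalg.lstsq(X, y, cond=cond)

# the dependent column is detected, and the minimum-norm solution
# splits the coefficient equally between the two identical columns
assert rank == 3
assert np.allclose(coef[0], coef[3])
```

Without the cutoff, tiny nonzero singular values produced by floating-point noise can be inverted, yielding large, unstable coefficients on exactly this kind of data.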
Review comment from a maintainer:
The fact we can remove this while there is still a problem with sparse inputs makes me realize that we should expand check_sample_weight_equivalence to also test fitting with sparse inputs (when the estimator accepts sparse inputs). Let's open a dedicated PR for this (e.g. by introducing a new check named check_sample_weight_equivalence_on_sparse_data).



def _check_precomputed_gram_matrix(
X, precompute, X_offset, X_scale, rtol=None, atol=1e-5
33 changes: 19 additions & 14 deletions sklearn/linear_model/tests/test_base.py
@@ -692,10 +692,23 @@ def test_fused_types_make_dataset(csr_container):
assert_array_equal(yi_64, yicsr_64)


@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("X_shape", [(10, 5), (10, 20), (100, 100)])
@pytest.mark.parametrize(
"sparse_container",
[None]
+ [
pytest.param(
container,
marks=pytest.mark.xfail(
reason="Known to fail for CSR arrays, see issue #30131."
),
)
for container in CSR_CONTAINERS
],
)
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_linear_regression_sample_weight_consistency(
sparse_container, fit_intercept, global_random_seed
X_shape, sparse_container, fit_intercept, global_random_seed
):
"""Test that the impact of sample_weight is consistent.

@@ -704,7 +717,7 @@ def test_linear_regression_sample_weight_consistency(
It is very similar to test_enet_sample_weight_consistency.
"""
rng = np.random.RandomState(global_random_seed)
n_samples, n_features = 10, 5
n_samples, n_features = X_shape

X = rng.rand(n_samples, n_features)
y = rng.rand(n_samples)
@@ -754,17 +767,9 @@ def test_linear_regression_sample_weight_consistency(
if fit_intercept:
intercept_0 = reg.intercept_
reg.fit(X[:-5], y[:-5], sample_weight=sample_weight[:-5])
if fit_intercept and sparse_container is None:
# FIXME: https://github.com/scikit-learn/scikit-learn/issues/26164
# This often fails, e.g. when calling
# SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all" pytest \
# sklearn/linear_model/tests/test_base.py\
# ::test_linear_regression_sample_weight_consistency
pass
else:
assert_allclose(reg.coef_, coef_0, rtol=1e-5)
if fit_intercept:
assert_allclose(reg.intercept_, intercept_0)
assert_allclose(reg.coef_, coef_0, rtol=1e-5)
if fit_intercept:
assert_allclose(reg.intercept_, intercept_0)

# 5) check that multiplying sample_weight by 2 is equivalent to repeating
# corresponding samples twice
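The reweighting/repeating equivalence exercised by this test can be sketched directly against `scipy.linalg.lstsq` (a standalone illustration under the same `cond` cutoff, not the test's actual code): fitting with integer sample weights, implemented by scaling rows with the square root of the weights, should match fitting on correspondingly repeated rows.

```python
# Illustrative sketch: weighting a sample by an integer w should be
# equivalent to repeating that sample w times.
import numpy as np
from scipy import linalg

rng = np.random.RandomState(42)
X = rng.rand(10, 5)
y = rng.rand(10)
sw = rng.randint(1, 4, size=10)  # integer weights in {1, 2, 3}

cond = max(X.shape) * np.finfo(X.dtype).eps

# weighted fit: scale each row (and target) by sqrt(weight)
sq = np.sqrt(sw)
coef_w, *_ = linalg.lstsq(sq[:, None] * X, sq * y, cond=cond)

# repeated fit: duplicate each row according to its weight
coef_r, *_ = linalg.lstsq(np.repeat(X, sw, axis=0), np.repeat(y, sw), cond=cond)

assert np.allclose(coef_w, coef_r, rtol=1e-8)
```

The two solutions agree because both minimize the same weighted sum of squared residuals; the `cond` cutoff keeps this equivalence numerically stable even when `X` is rank-deficient.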