Add tol to LinearRegression #30521

Merged
@@ -0,0 +1,4 @@
- |Enhancement| Added a new parameter `tol` to
:class:`linear_model.LinearRegression` that determines the precision of the
solution `coef_` when fitting on sparse data. :pr:`30521` by :user:`Success Moses
<SuccessMoses>`.
21 changes: 18 additions & 3 deletions sklearn/linear_model/_base.py
@@ -8,7 +8,7 @@
import numbers
import warnings
from abc import ABCMeta, abstractmethod
from numbers import Integral
from numbers import Integral, Real

import numpy as np
import scipy.sparse as sp
@@ -32,6 +32,7 @@
indexing_dtype,
supported_float_dtypes,
)
from ..utils._param_validation import Interval
from ..utils._seq_dataset import (
ArrayDataset32,
ArrayDataset64,
@@ -472,6 +473,15 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
copy_X : bool, default=True
If True, X will be copied; else, it may be overwritten.

tol : float, default=1e-4
The precision of the solution (`coef_`) when fitting on sparse training
data: `tol` is passed as both `atol` and `btol` to the
`scipy.sparse.linalg.lsqr` solver and controls its convergence
criterion. This parameter has no effect when fitting on dense data.

.. versionadded:: 1.7

n_jobs : int, default=None
The number of jobs to use for the computation. This will only provide
speedup in case of sufficiently large problems, that is if firstly
@@ -555,18 +565,21 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
"copy_X": ["boolean"],
"n_jobs": [None, Integral],
"positive": ["boolean"],
"tol": [Interval(Real, 0, None, closed="left")],
}

def __init__(
self,
*,
fit_intercept=True,
copy_X=True,
tol=1e-4,
n_jobs=None,
positive=False,
):
self.fit_intercept = fit_intercept
self.copy_X = copy_X
self.tol = tol
self.n_jobs = n_jobs
self.positive = positive

@@ -668,11 +681,13 @@ def rmatvec(b):
)

if y.ndim < 2:
self.coef_ = lsqr(X_centered, y)[0]
self.coef_ = lsqr(X_centered, y, atol=self.tol, btol=self.tol)[0]
else:
# sparse_lstsq cannot handle y with shape (M, K)
outs = Parallel(n_jobs=n_jobs_)(
delayed(lsqr)(X_centered, y[:, j].ravel())
delayed(lsqr)(
X_centered, y[:, j].ravel(), atol=self.tol, btol=self.tol
)
for j in range(y.shape[1])
)
self.coef_ = np.vstack([out[0] for out in outs])
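
As a quick illustration of the change above (a minimal sketch assuming this branch is installed; the data below is synthetic): because `tol` is forwarded as `atol` and `btol` to `scipy.sparse.linalg.lsqr`, it only influences fits on sparse input.

import numpy as np
import scipy.sparse as sp
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = sp.random(100, 10, density=0.3, format="csr", random_state=rng)
y = rng.randn(100)

# A tighter tolerance lets lsqr iterate longer, yielding a more precise coef_.
loose = LinearRegression(tol=1e-4).fit(X, y)
tight = LinearRegression(tol=1e-12).fit(X, y)
print(np.max(np.abs(loose.coef_ - tight.coef_)))  # small, solver-dependent gap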
2 changes: 1 addition & 1 deletion sklearn/tests/test_pipeline.py
@@ -371,7 +371,7 @@ def test_pipeline_raise_set_params_error():
# expected error message for invalid inner parameter
error_msg = re.escape(
"Invalid parameter 'invalid_param' for estimator LinearRegression(). Valid"
" parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive']."
" parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive', 'tol']."
)
with pytest.raises(ValueError, match=error_msg):
pipe.set_params(cls__invalid_param="nope")
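
The expected message tracks the estimator's parameter list, which now includes `tol`; a quick illustrative check (assuming this branch):

from sklearn.linear_model import LinearRegression
print(sorted(LinearRegression().get_params()))
# ['copy_X', 'fit_intercept', 'n_jobs', 'positive', 'tol']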
13 changes: 1 addition & 12 deletions sklearn/utils/_test_common/instance_generator.py
@@ -575,6 +575,7 @@
dict(positive=False),
dict(positive=True),
],
"check_sample_weight_equivalence_on_sparse_data": [dict(tol=1e-12)],
},
LocallyLinearEmbedding: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
LogisticRegression: {
@@ -983,18 +984,6 @@ def _yield_instances_for_check(check, estimator_orig):
KNeighborsTransformer: {
"check_methods_sample_order_invariance": "check is not applicable."
},
LinearRegression: {
# TODO: this model should converge to the minimum norm solution of the
# least squares problem and as a result be numerically stable enough when
# running the equivalence check even if n_features > n_samples. Maybe
# this is not the case and a different choice of solver could fix
# this problem. This might require setting a low enough value for the
# tolerance of the lsqr solver:
# https://github.com/scikit-learn/scikit-learn/issues/30131
"check_sample_weight_equivalence_on_sparse_data": (
"sample_weight is not equivalent to removing/repeating samples."
),
},
LinearSVC: {
# TODO: replace by a statistical test when _dual=True, see meta-issue #16298
"check_sample_weight_equivalence_on_dense_data": (
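
The skip removed above is replaced by running `check_sample_weight_equivalence_on_sparse_data` with `tol=1e-12`. A hedged sketch of the property that check verifies (not scikit-learn's actual test code): with integer sample weights, fitting should match fitting on data with rows repeated accordingly, which only holds once lsqr converges tightly enough.

import numpy as np
import scipy.sparse as sp
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = sp.csr_matrix(rng.randn(30, 5))
y = rng.randn(30)
sw = rng.randint(1, 4, size=30)  # integer sample weights

# Fit once with sample_weight, once with rows physically repeated.
weighted = LinearRegression(tol=1e-12).fit(X, y, sample_weight=sw)
X_rep = sp.csr_matrix(np.repeat(X.toarray(), sw, axis=0))
y_rep = np.repeat(y, sw)
repeated = LinearRegression(tol=1e-12).fit(X_rep, y_rep)

# Tolerances here are illustrative, not taken from the test suite.
np.testing.assert_allclose(weighted.coef_, repeated.coef_, atol=1e-8)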