Add tol to LinearRegression #30521

Merged
@@ -0,0 +1,4 @@
- |Enhancement| Added a new parameter `tol` to
:class:`linear_model.LinearRegression` that determines the precision of the
solution `coef_` when fitting on sparse data. :pr:`30521` by :user:`Success Moses
<SuccessMoses>`.
21 changes: 18 additions & 3 deletions sklearn/linear_model/_base.py
@@ -8,7 +8,7 @@
import numbers
import warnings
from abc import ABCMeta, abstractmethod
from numbers import Integral
from numbers import Integral, Real

import numpy as np
import scipy.sparse as sp
@@ -32,6 +32,7 @@
indexing_dtype,
supported_float_dtypes,
)
from ..utils._param_validation import Interval
from ..utils._seq_dataset import (
ArrayDataset32,
ArrayDataset64,
@@ -472,6 +473,15 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
copy_X : bool, default=True
If True, X will be copied; else, it may be overwritten.

tol : float, default=1e-4
The precision of the solution (`coef_`) when fitting on sparse training
data: `tol` is passed as both `atol` and `btol` to the
`scipy.sparse.linalg.lsqr` solver and controls its convergence
criterion. This parameter has no effect when fitting on dense data.

.. versionadded:: 1.7

n_jobs : int, default=None
The number of jobs to use for the computation. This will only provide
speedup in case of sufficiently large problems, that is if firstly
@@ -555,18 +565,21 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
"copy_X": ["boolean"],
"n_jobs": [None, Integral],
"positive": ["boolean"],
"tol": [Interval(Real, 0, None, closed="left")],
}

def __init__(
self,
*,
fit_intercept=True,
copy_X=True,
tol=1e-4,
n_jobs=None,
positive=False,
):
self.fit_intercept = fit_intercept
self.copy_X = copy_X
self.tol = tol
self.n_jobs = n_jobs
self.positive = positive

@@ -668,11 +681,13 @@ def rmatvec(b):
)

if y.ndim < 2:
self.coef_ = lsqr(X_centered, y)[0]
self.coef_ = lsqr(X_centered, y, atol=self.tol, btol=self.tol)[0]
else:
# sparse_lstsq cannot handle y with shape (M, K)
outs = Parallel(n_jobs=n_jobs_)(
delayed(lsqr)(X_centered, y[:, j].ravel())
delayed(lsqr)(
X_centered, y[:, j].ravel(), atol=self.tol, btol=self.tol
)
for j in range(y.shape[1])
)
self.coef_ = np.vstack([out[0] for out in outs])
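
As a quick illustration of the change above (a minimal sketch assuming this branch is installed; the data below is synthetic): because `tol` is forwarded as `atol` and `btol` to `scipy.sparse.linalg.lsqr`, it only influences fits on sparse input.

import numpy as np
import scipy.sparse as sp
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = sp.random(100, 10, density=0.3, format="csr", random_state=rng)
y = rng.randn(100)

# A tighter tolerance lets lsqr iterate longer, yielding a more precise coef_.
loose = LinearRegression(tol=1e-4).fit(X, y)
tight = LinearRegression(tol=1e-12).fit(X, y)
print(np.max(np.abs(loose.coef_ - tight.coef_)))  # small, solver-dependent gap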
2 changes: 1 addition & 1 deletion sklearn/tests/test_pipeline.py
@@ -371,7 +371,7 @@ def test_pipeline_raise_set_params_error():
# expected error message for invalid inner parameter
error_msg = re.escape(
"Invalid parameter 'invalid_param' for estimator LinearRegression(). Valid"
" parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive']."
" parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive', 'tol']."
)
with pytest.raises(ValueError, match=error_msg):
pipe.set_params(cls__invalid_param="nope")
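
The expected message tracks the estimator's parameter list, which now includes `tol`; a quick illustrative check (assuming this branch):

from sklearn.linear_model import LinearRegression
print(sorted(LinearRegression().get_params()))
# ['copy_X', 'fit_intercept', 'n_jobs', 'positive', 'tol']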
13 changes: 1 addition & 12 deletions sklearn/utils/_test_common/instance_generator.py
@@ -575,6 +575,7 @@
dict(positive=False),
dict(positive=True),
],
"check_sample_weight_equivalence_on_sparse_data": [dict(tol=1e-12)],
},
LocallyLinearEmbedding: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
LogisticRegression: {
@@ -983,18 +984,6 @@ def _yield_instances_for_check(check, estimator_orig):
KNeighborsTransformer: {
"check_methods_sample_order_invariance": "check is not applicable."
},
LinearRegression: {
# TODO: this model should converge to the minimum norm solution of the
# least squares problem and as a result be numerically stable enough when
# running the equivalence check even if n_features > n_samples. Maybe
# this is not the case and a different choice of solver could fix
# this problem. This might require setting a low enough value for the
# tolerance of the lsqr solver:
# https://github.com/scikit-learn/scikit-learn/issues/30131
"check_sample_weight_equivalence_on_sparse_data": (
"sample_weight is not equivalent to removing/repeating samples."
),
},
LinearSVC: {
# TODO: replace by a statistical test when _dual=True, see meta-issue #16298
"check_sample_weight_equivalence_on_dense_data": (
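
The skip removed above is replaced by running `check_sample_weight_equivalence_on_sparse_data` with `tol=1e-12`. A hedged sketch of the property that check verifies (not scikit-learn's actual test code): with integer sample weights, fitting should match fitting on data with rows repeated accordingly, which only holds once lsqr converges tightly enough.

import numpy as np
import scipy.sparse as sp
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = sp.csr_matrix(rng.randn(30, 5))
y = rng.randn(30)
sw = rng.randint(1, 4, size=30)  # integer sample weights

# Fit once with sample_weight, once with rows physically repeated.
weighted = LinearRegression(tol=1e-12).fit(X, y, sample_weight=sw)
X_rep = sp.csr_matrix(np.repeat(X.toarray(), sw, axis=0))
y_rep = np.repeat(y, sw)
repeated = LinearRegression(tol=1e-12).fit(X_rep, y_rep)

# Tolerances here are illustrative, not taken from the test suite.
np.testing.assert_allclose(weighted.coef_, repeated.coef_, atol=1e-8)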