FIX: make LinearRegression perfectly consistent across sparse or dense #13279

Merged
5 changes: 5 additions & 0 deletions doc/whats_new/v0.21.rst
@@ -174,6 +174,11 @@ Support for Python 3.4 and below has been officially dropped.
parameter value ``copy_X=True`` in ``fit``.
:issue:`12972` by :user:`Lucio Fernandez-Arjona <luk-f-a>`

- |Fix| Fixed a bug in :class:`linear_model.LinearRegression` that
was not returning the same coefficients and intercepts with
``fit_intercept=True`` in the sparse and dense cases.
:issue:`13279` by `Alexandre Gramfort`_

:mod:`sklearn.manifold`
............................

19 changes: 16 additions & 3 deletions sklearn/linear_model/base.py
@@ -459,21 +459,34 @@ def fit(self, X, y, sample_weight=None):

X, y, X_offset, y_offset, X_scale = self._preprocess_data(
X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
copy=self.copy_X, sample_weight=sample_weight)
copy=self.copy_X, sample_weight=sample_weight,
return_mean=True)

if sample_weight is not None:
# Sample weight can be implemented via a simple rescaling.
X, y = _rescale_data(X, y, sample_weight)

if sp.issparse(X):
X_offset_scale = X_offset / X_scale

def matvec(b):
return X.dot(b) - b.dot(X_offset_scale)

def rmatvec(b):
return X.T.dot(b) - X_offset_scale * np.sum(b)

X_centered = sparse.linalg.LinearOperator(shape=X.shape,
matvec=matvec,
rmatvec=rmatvec)
Review comment (Member): Very elegant!
if y.ndim < 2:
out = sparse_lsqr(X, y)
out = sparse_lsqr(X_centered, y)
self.coef_ = out[0]
self._residues = out[3]
else:
# sparse_lstsq cannot handle y with shape (M, K)
outs = Parallel(n_jobs=n_jobs_)(
delayed(sparse_lsqr)(X, y[:, j].ravel())
delayed(sparse_lsqr)(X_centered, y[:, j].ravel())
for j in range(y.shape[1]))
self.coef_ = np.vstack([out[0] for out in outs])
self._residues = np.vstack([out[3] for out in outs])
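The key change above: in the sparse branch, fit no longer passes X directly to sparse_lsqr. Instead it wraps X in a scipy.sparse.linalg.LinearOperator whose matvec/rmatvec subtract the scaled column means (X_offset / X_scale) on the fly, so LSQR solves against the centered design matrix without ever densifying it, and the sparse path now matches the dense path when fit_intercept=True.

Below is a minimal standalone sketch of the same idea, not the scikit-learn code itself; the toy data, the choice X_scale == 1 (i.e. normalize=False), and the explicit-centering comparison are illustrative assumptions for this example only.

import numpy as np
from scipy import sparse
from scipy.sparse.linalg import LinearOperator, lsqr

rng = np.random.RandomState(0)
X = sparse.random(50, 3, density=0.3, format="csr", random_state=rng)
y = rng.rand(50)

# Column means of X; with normalize=False, X_scale is all ones, so
# X_offset / X_scale reduces to the plain column means.
X_offset = np.asarray(X.mean(axis=0)).ravel()

def matvec(b):
    # (X - ones @ X_offset[None, :]) @ b  ==  X @ b - (X_offset . b) * ones
    return X.dot(b) - b.dot(X_offset)

def rmatvec(b):
    # (X - ones @ X_offset[None, :]).T @ b  ==  X.T @ b - X_offset * sum(b)
    return X.T.dot(b) - X_offset * np.sum(b)

X_centered = LinearOperator(shape=X.shape, matvec=matvec, rmatvec=rmatvec)

# Solving against the implicit operator gives the same coefficients as
# solving against an explicitly centered (dense) copy of X.
coef_implicit = lsqr(X_centered, y - y.mean())[0]
coef_explicit = lsqr(X.toarray() - X_offset, y - y.mean())[0]
np.testing.assert_allclose(coef_implicit, coef_explicit, rtol=1e-5)

The design choice here is memory-driven: explicitly subtracting the column means from a sparse matrix would make it dense, whereas the operator keeps X in its sparse format and only applies the correction inside each matrix-vector product.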
21 changes: 21 additions & 0 deletions sklearn/linear_model/tests/test_base.py
@@ -13,6 +13,7 @@
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_allclose

from sklearn.linear_model.base import LinearRegression
from sklearn.linear_model.base import _preprocess_data
@@ -150,6 +151,26 @@ def test_linear_regression_sparse(random_state=0):
assert_array_almost_equal(ols.predict(X) - y.ravel(), 0)


@pytest.mark.parametrize('normalize', [True, False])
@pytest.mark.parametrize('fit_intercept', [True, False])
def test_linear_regression_sparse_equal_dense(normalize, fit_intercept):
# Test that linear regression agrees between sparse and dense
rng = check_random_state(0)
n_samples = 200
n_features = 2
X = rng.randn(n_samples, n_features)
X[X < 0.1] = 0.
Xcsr = sparse.csr_matrix(X)
y = rng.rand(n_samples)
params = dict(normalize=normalize, fit_intercept=fit_intercept)
clf_dense = LinearRegression(**params)
clf_sparse = LinearRegression(**params)
clf_dense.fit(X, y)
clf_sparse.fit(Xcsr, y)
assert clf_dense.intercept_ == pytest.approx(clf_sparse.intercept_)
assert_allclose(clf_dense.coef_, clf_sparse.coef_)


def test_linear_regression_multiple_outcome(random_state=0):
# Test multiple-outcome linear regressions
X, y = make_regression(random_state=random_state)