From 8c8179e5d6e316249b4ce68266ed86b5f3a93bec Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Tue, 8 Oct 2024 10:52:20 +0200 Subject: [PATCH 1/5] use numpy least squares solver --- sklearn/linear_model/_base.py | 6 +++--- sklearn/linear_model/tests/test_base.py | 18 ++++++------------ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 02e6e75758a57..998b7566741d1 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -12,7 +12,7 @@ import numpy as np import scipy.sparse as sp -from scipy import linalg, optimize, sparse +from scipy import optimize, sparse from scipy.sparse.linalg import lsqr from scipy.special import expit @@ -525,7 +525,7 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): Notes ----- From the implementation point of view, this is just plain Ordinary - Least Squares (scipy.linalg.lstsq) or Non Negative Least Squares + Least Squares (numpy.linalg.lstsq) or Non Negative Least Squares (scipy.optimize.nnls) wrapped as a predictor object. Examples @@ -673,7 +673,7 @@ def rmatvec(b): ) self.coef_ = np.vstack([out[0] for out in outs]) else: - self.coef_, _, self.rank_, self.singular_ = linalg.lstsq(X, y) + self.coef_, _, self.rank_, self.singular_ = np.linalg.lstsq(X, y) self.coef_ = self.coef_.T if y.ndim == 1: diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 05b7712113228..e8e0eef869630 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -704,7 +704,7 @@ def test_linear_regression_sample_weight_consistency( It is very similar to test_enet_sample_weight_consistency. """ rng = np.random.RandomState(global_random_seed) - n_samples, n_features = 10, 5 + n_samples, n_features = 10, 15 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) @@ -754,17 +754,11 @@ def test_linear_regression_sample_weight_consistency( if fit_intercept: intercept_0 = reg.intercept_ reg.fit(X[:-5], y[:-5], sample_weight=sample_weight[:-5]) - if fit_intercept and sparse_container is None: - # FIXME: https://github.com/scikit-learn/scikit-learn/issues/26164 - # This often fails, e.g. when calling - # SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all" pytest \ - # sklearn/linear_model/tests/test_base.py\ - # ::test_linear_regression_sample_weight_consistency - pass - else: - assert_allclose(reg.coef_, coef_0, rtol=1e-5) - if fit_intercept: - assert_allclose(reg.intercept_, intercept_0) + # TODO: see if it fixes + # https://github.com/scikit-learn/scikit-learn/issues/26164 + assert_allclose(reg.coef_, coef_0, rtol=1e-5) + if fit_intercept: + assert_allclose(reg.intercept_, intercept_0) # 5) check that multiplying sample_weight by 2 is equivalent to repeating # corresponding samples twice From f7d27432b873f043df31161f204945cf01907cae Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 9 Oct 2024 16:56:59 +0200 Subject: [PATCH 2/5] pass rcond [all random seeds] test_linear_regression_sample_weight_consistency --- sklearn/linear_model/_base.py | 6 +++++- sklearn/linear_model/tests/test_base.py | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 998b7566741d1..d9db8bcf29f87 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -673,7 +673,11 @@ def rmatvec(b): ) self.coef_ = np.vstack([out[0] for out in outs]) else: - self.coef_, _, self.rank_, self.singular_ = np.linalg.lstsq(X, y) + # cut-off ratio for small singular values (numpy 2.0 default value) + rcond = max(X.shape) * np.finfo(X.dtype).eps + self.coef_, _, self.rank_, self.singular_ = np.linalg.lstsq( + X, y, rcond=rcond + ) self.coef_ = self.coef_.T if y.ndim == 1: diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index e8e0eef869630..af0c1d0817942 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -741,7 +741,7 @@ def test_linear_regression_sample_weight_consistency( intercept = reg.intercept_ reg.fit(X, y, sample_weight=np.pi * sample_weight) - assert_allclose(reg.coef_, coef, rtol=1e-6 if sparse_container is None else 1e-5) + assert_allclose(reg.coef_, coef, rtol=1e-6) if fit_intercept: assert_allclose(reg.intercept_, intercept) @@ -756,7 +756,7 @@ def test_linear_regression_sample_weight_consistency( reg.fit(X[:-5], y[:-5], sample_weight=sample_weight[:-5]) # TODO: see if it fixes # https://github.com/scikit-learn/scikit-learn/issues/26164 - assert_allclose(reg.coef_, coef_0, rtol=1e-5) + assert_allclose(reg.coef_, coef_0, rtol=1e-6) if fit_intercept: assert_allclose(reg.intercept_, intercept_0) @@ -775,6 +775,6 @@ def test_linear_regression_sample_weight_consistency( reg1 = LinearRegression(**params).fit(X, y, sample_weight=sample_weight_1) reg2 = LinearRegression(**params).fit(X2, y2, sample_weight=sample_weight_2) - assert_allclose(reg1.coef_, reg2.coef_, rtol=1e-6) + assert_allclose(reg1.coef_, reg2.coef_, rtol=1e-5) if fit_intercept: assert_allclose(reg1.intercept_, reg2.intercept_) From 1ff56260e3bc909a490dca625c551144adacb901 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Thu, 10 Oct 2024 10:05:56 +0200 Subject: [PATCH 3/5] revert to old test [all random seeds] test_linear_regression_sample_weight_consistency --- sklearn/linear_model/tests/test_base.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index af0c1d0817942..1581ecdffc3ae 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -704,7 +704,7 @@ def test_linear_regression_sample_weight_consistency( It is very similar to test_enet_sample_weight_consistency. """ rng = np.random.RandomState(global_random_seed) - n_samples, n_features = 10, 15 + n_samples, n_features = 10, 5 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) @@ -741,7 +741,7 @@ def test_linear_regression_sample_weight_consistency( intercept = reg.intercept_ reg.fit(X, y, sample_weight=np.pi * sample_weight) - assert_allclose(reg.coef_, coef, rtol=1e-6) + assert_allclose(reg.coef_, coef, rtol=1e-6 if sparse_container is None else 1e-5) if fit_intercept: assert_allclose(reg.intercept_, intercept) @@ -754,9 +754,7 @@ def test_linear_regression_sample_weight_consistency( if fit_intercept: intercept_0 = reg.intercept_ reg.fit(X[:-5], y[:-5], sample_weight=sample_weight[:-5]) - # TODO: see if it fixes - # https://github.com/scikit-learn/scikit-learn/issues/26164 - assert_allclose(reg.coef_, coef_0, rtol=1e-6) + assert_allclose(reg.coef_, coef_0, rtol=1e-5) if fit_intercept: assert_allclose(reg.intercept_, intercept_0) @@ -775,6 +773,6 @@ def test_linear_regression_sample_weight_consistency( reg1 = LinearRegression(**params).fit(X, y, sample_weight=sample_weight_1) reg2 = LinearRegression(**params).fit(X2, y2, sample_weight=sample_weight_2) - assert_allclose(reg1.coef_, reg2.coef_, rtol=1e-5) + assert_allclose(reg1.coef_, reg2.coef_, rtol=1e-6) if fit_intercept: assert_allclose(reg1.intercept_, reg2.intercept_) From 784f54d2b821ae7d939a595ab592b6f8b6435cbd Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 11 Oct 2024 17:53:02 +0200 Subject: [PATCH 4/5] changelog [all random seeds] test_linear_regression_sample_weight_consistency --- doc/whats_new/v1.6.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 426884ca0513c..7634e5b1e3d43 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -261,6 +261,10 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Fix| :class:`linear_model.LinearRegression` replaces the `scipy.linalg.lstsq` + solver by `numpy.linalg.lstsq` and sets the `rcond` parameter. + :pr:`30040` by :user:`Antoine Baker `. + - |Fix| :class:`linear_model.LogisticRegressionCV` corrects sample weight handling for the calculation of test scores. :pr:`29419` by :user:`Shruti Nath `. From 4c8d6e991575e9726e023b927454d2b589fd6e53 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 11 Oct 2024 18:15:27 +0200 Subject: [PATCH 5/5] oops wrong PR [all random seeds] test_linear_regression_sample_weight_consistency --- doc/whats_new/v1.6.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 7634e5b1e3d43..c7e9d525a4c70 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -263,7 +263,7 @@ Changelog - |Fix| :class:`linear_model.LinearRegression` replaces the `scipy.linalg.lstsq` solver by `numpy.linalg.lstsq` and sets the `rcond` parameter. - :pr:`30040` by :user:`Antoine Baker `. + :pr:`30030` by :user:`Antoine Baker `. - |Fix| :class:`linear_model.LogisticRegressionCV` corrects sample weight handling for the calculation of test scores.