From 483c9744a2e259f3036ac2042d5a5f45d792996d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 18 Apr 2023 11:05:06 +0200 Subject: [PATCH 1/8] PERF fix overhead of _rescale_data in LinearRegression --- sklearn/linear_model/_base.py | 54 ++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index e8e907906efc3..6e46ddce81fa7 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -317,12 +317,31 @@ def _rescale_data(X, y, sample_weight): """ n_samples = X.shape[0] sample_weight = np.asarray(sample_weight) + if sample_weight.ndim == 0: + # XXX: compat for scalar weight: do we really need to support this? sample_weight = np.full(n_samples, sample_weight, dtype=sample_weight.dtype) + sample_weight_sqrt = np.sqrt(sample_weight) - sw_matrix = sparse.dia_matrix((sample_weight_sqrt, 0), shape=(n_samples, n_samples)) - X = safe_sparse_dot(sw_matrix, X) - y = safe_sparse_dot(sw_matrix, y) + + if sp.issparse(X) or sp.issparse(y): + sw_matrix = sparse.dia_matrix( + (sample_weight_sqrt, 0), shape=(n_samples, n_samples) + ) + if sp.issparse(X): + X = safe_sparse_dot(sw_matrix, X) + else: + # XXX: we do not do inplace multiplication on X for consistency + # with the sparse case and because the _rescale_data currently + # does not make it explicit if it's ok to do it or not. + X = X * sample_weight_sqrt[:, np.newaxis] + if sp.issparse(y): + y = safe_sparse_dot(sw_matrix, y) + else: + if y.ndim == 1: + y = y * sample_weight_sqrt + else: + y = y * sample_weight_sqrt[:, np.newaxis] return X, y, sample_weight_sqrt @@ -651,9 +670,11 @@ def fit(self, X, y, sample_weight=None): X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True ) - sample_weight = _check_sample_weight( - sample_weight, X, dtype=X.dtype, only_non_negative=True - ) + has_sw = sample_weight is not None + if has_sw: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, only_non_negative=True + ) X, y, X_offset, y_offset, X_scale = _preprocess_data( X, @@ -664,7 +685,8 @@ def fit(self, X, y, sample_weight=None): ) # Sample weight can be implemented via a simple rescaling. - X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight) + if has_sw: + X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight) if self.positive: if y.ndim < 2: @@ -678,11 +700,21 @@ def fit(self, X, y, sample_weight=None): elif sp.issparse(X): X_offset_scale = X_offset / X_scale - def matvec(b): - return X.dot(b) - sample_weight_sqrt * b.dot(X_offset_scale) + if has_sw: + + def matvec(b): + return X.dot(b) - sample_weight_sqrt * b.dot(X_offset_scale) + + def rmatvec(b): + return X.T.dot(b) - X_offset_scale * b.dot(sample_weight_sqrt) + + else: + + def matvec(b): + return X.dot(b) - b.dot(X_offset_scale) - def rmatvec(b): - return X.T.dot(b) - X_offset_scale * b.dot(sample_weight_sqrt) + def rmatvec(b): + return X.T.dot(b) - X_offset_scale * b.sum() X_centered = sparse.linalg.LinearOperator( shape=X.shape, matvec=matvec, rmatvec=rmatvec From 6092129077754c89ea48494f054648a4f0e064d0 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 18 Apr 2023 11:18:01 +0200 Subject: [PATCH 2/8] DOC add entry to the changelog --- doc/whats_new/v1.3.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 2200458cc5d32..5a0c44c37b58f 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -267,7 +267,7 @@ Changelog meaning that it is note required to call `fit` before calling `transform`. Parameter validation only happens at `fit` time. :pr:`24230` by :user:`Guillaume Lemaitre `. - + :mod:`sklearn.feature_selection` ................................ @@ -294,6 +294,10 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Efficiency| Avoid uncessary data scaling and conversion to sparse data + structures in :class:`linear_model.LinearRegression` when not necessary. + :pr:`26207` by :user:`Olivier Grisel `. + - |Enhancement| :class:`linear_model.SGDClassifier`, :class:`linear_model.SGDRegressor` and :class:`linear_model.SGDOneClassSVM` now preserve dtype for `numpy.float32`. @@ -309,7 +313,7 @@ Changelog :class:`linear_model.ARDRegression` to expose the actual number of iterations required to reach the stopping criterion. :pr:`25697` by :user:`John Pangas `. - + - |Fix| Use a more robust criterion to detect convergence of :class:`linear_model.LogisticRegression(penalty="l1", solver="liblinear")` on linearly separable problems. From c9fcb4752fbb25a1828902c8217d7a4feca57742 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 18 Apr 2023 11:56:22 +0200 Subject: [PATCH 3/8] Clean-up --- sklearn/linear_model/_base.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 6e46ddce81fa7..d96b7ad0d69e3 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -315,13 +315,9 @@ def _rescale_data(X, y, sample_weight): y_rescaled : {array-like, sparse matrix} """ + # Assume that _validate_data and _check_sample_weight have been called by + # the caller. n_samples = X.shape[0] - sample_weight = np.asarray(sample_weight) - - if sample_weight.ndim == 0: - # XXX: compat for scalar weight: do we really need to support this? - sample_weight = np.full(n_samples, sample_weight, dtype=sample_weight.dtype) - sample_weight_sqrt = np.sqrt(sample_weight) if sp.issparse(X) or sp.issparse(y): From bab265a267ce765af8f58708a846c04d85cb0942 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 18 Apr 2023 14:50:00 +0200 Subject: [PATCH 4/8] TST test_rescale_data with sparse y --- sklearn/linear_model/tests/test_base.py | 35 ++++++++++++++++++++----- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index f8b1bd7b7b65c..bea1111778b27 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -661,7 +661,8 @@ def test_dtype_preprocess_data(global_random_seed): @pytest.mark.parametrize("n_targets", [None, 2]) -def test_rescale_data_dense(n_targets, global_random_seed): +@pytest.mark.parametrize("sparse_data", [True, False]) +def test_rescale_data(n_targets, sparse_data, global_random_seed): rng = np.random.RandomState(global_random_seed) n_samples = 200 n_features = 2 @@ -672,14 +673,34 @@ def test_rescale_data_dense(n_targets, global_random_seed): y = rng.rand(n_samples) else: y = rng.rand(n_samples, n_targets) - rescaled_X, rescaled_y, sqrt_sw = _rescale_data(X, y, sample_weight) - rescaled_X2 = X * sqrt_sw[:, np.newaxis] + + expected_sqrt_sw = np.sqrt(sample_weight) + expected_rescaled_X = X * expected_sqrt_sw[:, np.newaxis] + if n_targets is None: - rescaled_y2 = y * sqrt_sw + expected_rescaled_y = y * expected_sqrt_sw else: - rescaled_y2 = y * sqrt_sw[:, np.newaxis] - assert_array_almost_equal(rescaled_X, rescaled_X2) - assert_array_almost_equal(rescaled_y, rescaled_y2) + expected_rescaled_y = y * expected_sqrt_sw[:, np.newaxis] + + if sparse_data: + X = sparse.csr_matrix(X) + if n_targets is None: + y = sparse.csr_matrix(y.reshape(-1, 1)) + else: + y = sparse.csr_matrix(y) + + rescaled_X, rescaled_y, sqrt_sw = _rescale_data(X, y, sample_weight) + + assert_allclose(sqrt_sw, expected_sqrt_sw) + + if sparse_data: + rescaled_X = rescaled_X.toarray() + rescaled_y = rescaled_y.toarray() + if n_targets is None: + rescaled_y = rescaled_y.ravel() + + assert_allclose(rescaled_X, expected_rescaled_X) + assert_allclose(rescaled_y, expected_rescaled_y) def test_fused_types_make_dataset(): From 74860c48d1684c9afada7d7753ba01cee1cf51e1 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 18 Apr 2023 17:14:51 +0200 Subject: [PATCH 5/8] Insert blank line before if block for readability Co-authored-by: Thomas J. Fan --- sklearn/linear_model/_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index d96b7ad0d69e3..b62466d2a23b6 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -331,6 +331,7 @@ def _rescale_data(X, y, sample_weight): # with the sparse case and because the _rescale_data currently # does not make it explicit if it's ok to do it or not. X = X * sample_weight_sqrt[:, np.newaxis] + if sp.issparse(y): y = safe_sparse_dot(sw_matrix, y) else: From 278c6413fdfcd779f4253b6f74292cffca2143b2 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 18 Apr 2023 19:17:55 +0200 Subject: [PATCH 6/8] one more blank line for readability Co-authored-by: Christian Lorentzen --- sklearn/linear_model/_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index b62466d2a23b6..035cdcc0c8c75 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -324,6 +324,7 @@ def _rescale_data(X, y, sample_weight): sw_matrix = sparse.dia_matrix( (sample_weight_sqrt, 0), shape=(n_samples, n_samples) ) + if sp.issparse(X): X = safe_sparse_dot(sw_matrix, X) else: From 00a69de827fd545403a32c62dee17461798bde45 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 18 Apr 2023 19:31:51 +0200 Subject: [PATCH 7/8] Remove extra data copies when not needed --- sklearn/linear_model/_base.py | 55 +++++++++++++++--------- sklearn/linear_model/tests/test_base.py | 56 +++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 19 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 035cdcc0c8c75..d05713a34a139 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -187,6 +187,7 @@ def _preprocess_data( fit_intercept, normalize=False, copy=True, + copy_y=True, sample_weight=None, check_input=True, ): @@ -230,13 +231,14 @@ def _preprocess_data( if check_input: X = check_array(X, copy=copy, accept_sparse=["csr", "csc"], dtype=FLOAT_DTYPES) - elif copy: - if sp.issparse(X): - X = X.copy() - else: - X = X.copy(order="K") - - y = np.asarray(y, dtype=X.dtype) + y = check_array(y, dtype=X.dtype, copy=copy_y, ensure_2d=False) + else: + y = y.astype(X.dtype, copy=copy_y) + if copy: + if sp.issparse(X): + X = X.copy() + else: + X = X.copy(order="K") if fit_intercept: if sp.issparse(X): @@ -276,7 +278,7 @@ def _preprocess_data( X_scale = np.ones(X.shape[1], dtype=X.dtype) y_offset = np.average(y, axis=0, weights=sample_weight) - y = y - y_offset + y -= y_offset else: X_offset = np.zeros(X.shape[1], dtype=X.dtype) X_scale = np.ones(X.shape[1], dtype=X.dtype) @@ -293,7 +295,7 @@ def _preprocess_data( # sample_weight makes the refactoring tricky. -def _rescale_data(X, y, sample_weight): +def _rescale_data(X, y, sample_weight, inplace=False): """Rescale data sample-wise by square root of sample_weight. For many linear models, this enables easy support for sample_weight because @@ -328,18 +330,24 @@ def _rescale_data(X, y, sample_weight): if sp.issparse(X): X = safe_sparse_dot(sw_matrix, X) else: - # XXX: we do not do inplace multiplication on X for consistency - # with the sparse case and because the _rescale_data currently - # does not make it explicit if it's ok to do it or not. - X = X * sample_weight_sqrt[:, np.newaxis] + if inplace: + X *= sample_weight_sqrt[:, np.newaxis] + else: + X = X * sample_weight_sqrt[:, np.newaxis] if sp.issparse(y): y = safe_sparse_dot(sw_matrix, y) else: - if y.ndim == 1: - y = y * sample_weight_sqrt + if inplace: + if y.ndim == 1: + y *= sample_weight_sqrt + else: + y *= sample_weight_sqrt[:, np.newaxis] else: - y = y * sample_weight_sqrt[:, np.newaxis] + if y.ndim == 1: + y = y * sample_weight_sqrt + else: + y = y * sample_weight_sqrt[:, np.newaxis] return X, y, sample_weight_sqrt @@ -674,17 +682,26 @@ def fit(self, X, y, sample_weight=None): sample_weight, X, dtype=X.dtype, only_non_negative=True ) + # Note that neither _rescale_data nor the rest of the fit method of + # LinearRegression can benefit from in-place operations when X is a + # sparse matrix. Therefore, let's not copy X when it is sparse. + copy_X_in_preprocess_data = self.copy_X and not sp.issparse(X) + X, y, X_offset, y_offset, X_scale = _preprocess_data( X, y, fit_intercept=self.fit_intercept, - copy=self.copy_X, + copy=copy_X_in_preprocess_data, sample_weight=sample_weight, ) - # Sample weight can be implemented via a simple rescaling. if has_sw: - X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight) + # Sample weight can be implemented via a simple rescaling. Note + # that we safely do inplace rescaling when _preprocess_data has + # already made a copy if requested. + X, y, sample_weight_sqrt = _rescale_data( + X, y, sample_weight, inplace=copy_X_in_preprocess_data + ) if self.positive: if y.ndim < 2: diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index bea1111778b27..112583bae593a 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -315,6 +315,62 @@ def test_linear_regression_positive_vs_nonpositive_when_positive(global_random_s assert np.mean((reg.coef_ - regn.coef_) ** 2) < 1e-6 +@pytest.mark.parametrize("sparse_X", [True, False]) +@pytest.mark.parametrize("use_sw", [True, False]) +def test_inplace_data_preprocessing(sparse_X, use_sw, global_random_seed): + # Check that the data is not modified inplace by the linear regression + # estimator. + rng = np.random.RandomState(global_random_seed) + original_X_data = rng.randn(10, 12) + original_y_data = rng.randn(10, 2) + orginal_sw_data = rng.rand(10) + + if sparse_X: + X = sparse.csr_matrix(original_X_data) + else: + X = original_X_data.copy() + y = original_y_data.copy() + # XXX: Note hat y_sparse is not supported (broken?) in the current + # implementation of LinearRegression. + + if use_sw: + sample_weight = orginal_sw_data.copy() + else: + sample_weight = None + + # Do not allow inplace preprocessing of X and y: + reg = LinearRegression() + reg.fit(X, y, sample_weight=sample_weight) + if sparse_X: + assert_allclose(X.toarray(), original_X_data) + else: + assert_allclose(X, original_X_data) + assert_allclose(y, original_y_data) + + if use_sw: + assert_allclose(sample_weight, orginal_sw_data) + + # Allow inplace preprocessing of X and y + reg = LinearRegression(copy_X=False) + reg.fit(X, y, sample_weight=sample_weight) + if sparse_X: + # No optimization relying on the inplace modification of sparse input + # data has been implemented at this time. + assert_allclose(X.toarray(), original_X_data) + else: + # X has been offset (and optionally rescaled by sample weights) + # inplace. The 0.42 threshold is arbitrary and has been found to be + # robust to any random seed in the admissible range. + assert np.linalg.norm(X - original_X_data) > 0.42 + + # y should not have been modified inplace by LinearRegression.fit. + assert_allclose(y, original_y_data) + + if use_sw: + # Sample weights have no reason to ever be modified inplace. + assert_allclose(sample_weight, orginal_sw_data) + + def test_linear_regression_pd_sparse_dataframe_warning(): pd = pytest.importorskip("pandas") From 683647242f2399a2793a4f5a4ef28c6472f899d3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 18 Apr 2023 19:40:00 +0200 Subject: [PATCH 8/8] DOC better changelog entry --- doc/whats_new/v1.3.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 5a0c44c37b58f..aa5ade8969003 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -294,8 +294,9 @@ Changelog :mod:`sklearn.linear_model` ........................... -- |Efficiency| Avoid uncessary data scaling and conversion to sparse data - structures in :class:`linear_model.LinearRegression` when not necessary. +- |Efficiency| Avoid data scaling when `sample_weight=None` and other + unnecessary data copies and unexpected dense to sparse data conversion in + :class:`linear_model.LinearRegression`. :pr:`26207` by :user:`Olivier Grisel `. - |Enhancement| :class:`linear_model.SGDClassifier`,