From 483c9744a2e259f3036ac2042d5a5f45d792996d Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 18 Apr 2023 11:05:06 +0200
Subject: [PATCH 1/8] PERF fix overhead of _rescale_data in LinearRegression

---
 sklearn/linear_model/_base.py | 54 ++++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index e8e907906efc3..6e46ddce81fa7 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -317,12 +317,31 @@ def _rescale_data(X, y, sample_weight):
     """
     n_samples = X.shape[0]
     sample_weight = np.asarray(sample_weight)
+
     if sample_weight.ndim == 0:
+        # XXX: compat for scalar weight: do we really need to support this?
         sample_weight = np.full(n_samples, sample_weight, dtype=sample_weight.dtype)
+
     sample_weight_sqrt = np.sqrt(sample_weight)
-    sw_matrix = sparse.dia_matrix((sample_weight_sqrt, 0), shape=(n_samples, n_samples))
-    X = safe_sparse_dot(sw_matrix, X)
-    y = safe_sparse_dot(sw_matrix, y)
+
+    if sp.issparse(X) or sp.issparse(y):
+        sw_matrix = sparse.dia_matrix(
+            (sample_weight_sqrt, 0), shape=(n_samples, n_samples)
+        )
+    if sp.issparse(X):
+        X = safe_sparse_dot(sw_matrix, X)
+    else:
+        # XXX: we do not do inplace multiplication on X for consistency
+        # with the sparse case and because the _rescale_data currently
+        # does not make it explicit if it's ok to do it or not.
+        X = X * sample_weight_sqrt[:, np.newaxis]
+    if sp.issparse(y):
+        y = safe_sparse_dot(sw_matrix, y)
+    else:
+        if y.ndim == 1:
+            y = y * sample_weight_sqrt
+        else:
+            y = y * sample_weight_sqrt[:, np.newaxis]
     return X, y, sample_weight_sqrt
 
 
@@ -651,9 +670,11 @@ def fit(self, X, y, sample_weight=None):
             X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
         )
 
-        sample_weight = _check_sample_weight(
-            sample_weight, X, dtype=X.dtype, only_non_negative=True
-        )
+        has_sw = sample_weight is not None
+        if has_sw:
+            sample_weight = _check_sample_weight(
+                sample_weight, X, dtype=X.dtype, only_non_negative=True
+            )
 
         X, y, X_offset, y_offset, X_scale = _preprocess_data(
             X,
@@ -664,7 +685,8 @@ def fit(self, X, y, sample_weight=None):
         )
 
         # Sample weight can be implemented via a simple rescaling.
-        X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight)
+        if has_sw:
+            X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight)
 
         if self.positive:
             if y.ndim < 2:
@@ -678,11 +700,21 @@ def fit(self, X, y, sample_weight=None):
         elif sp.issparse(X):
             X_offset_scale = X_offset / X_scale
 
-            def matvec(b):
-                return X.dot(b) - sample_weight_sqrt * b.dot(X_offset_scale)
+            if has_sw:
+
+                def matvec(b):
+                    return X.dot(b) - sample_weight_sqrt * b.dot(X_offset_scale)
+
+                def rmatvec(b):
+                    return X.T.dot(b) - X_offset_scale * b.dot(sample_weight_sqrt)
+
+            else:
+
+                def matvec(b):
+                    return X.dot(b) - b.dot(X_offset_scale)
 
-            def rmatvec(b):
-                return X.T.dot(b) - X_offset_scale * b.dot(sample_weight_sqrt)
+                def rmatvec(b):
+                    return X.T.dot(b) - X_offset_scale * b.sum()
 
             X_centered = sparse.linalg.LinearOperator(
                 shape=X.shape, matvec=matvec, rmatvec=rmatvec

From 6092129077754c89ea48494f054648a4f0e064d0 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 18 Apr 2023 11:18:01 +0200
Subject: [PATCH 2/8] DOC add entry to the changelog

---
 doc/whats_new/v1.3.rst | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 2200458cc5d32..5a0c44c37b58f 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -267,7 +267,7 @@ Changelog
   meaning that it is note required to call `fit` before calling `transform`.
   Parameter validation only happens at `fit` time.
   :pr:`24230` by :user:`Guillaume Lemaitre <glemaitre>`.
-  
+
 :mod:`sklearn.feature_selection`
 ................................
 
@@ -294,6 +294,10 @@ Changelog
 :mod:`sklearn.linear_model`
 ...........................
 
+- |Efficiency| Avoid uncessary data scaling and conversion to sparse data
+  structures in :class:`linear_model.LinearRegression` when not necessary.
+  :pr:`26207` by :user:`Olivier Grisel <ogrisel>`.
+
 - |Enhancement| :class:`linear_model.SGDClassifier`,
   :class:`linear_model.SGDRegressor` and :class:`linear_model.SGDOneClassSVM`
   now preserve dtype for `numpy.float32`.
@@ -309,7 +313,7 @@ Changelog
   :class:`linear_model.ARDRegression` to expose the actual number of iterations
   required to reach the stopping criterion.
   :pr:`25697` by :user:`John Pangas <jpangas>`.
-  
+
 - |Fix| Use a more robust criterion to detect convergence of
   :class:`linear_model.LogisticRegression(penalty="l1", solver="liblinear")`
   on linearly separable problems.

From c9fcb4752fbb25a1828902c8217d7a4feca57742 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 18 Apr 2023 11:56:22 +0200
Subject: [PATCH 3/8] Clean-up

---
 sklearn/linear_model/_base.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index 6e46ddce81fa7..d96b7ad0d69e3 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -315,13 +315,9 @@ def _rescale_data(X, y, sample_weight):
 
     y_rescaled : {array-like, sparse matrix}
     """
+    # Assume that _validate_data and _check_sample_weight have been called by
+    # the caller.
     n_samples = X.shape[0]
-    sample_weight = np.asarray(sample_weight)
-
-    if sample_weight.ndim == 0:
-        # XXX: compat for scalar weight: do we really need to support this?
-        sample_weight = np.full(n_samples, sample_weight, dtype=sample_weight.dtype)
-
     sample_weight_sqrt = np.sqrt(sample_weight)
 
     if sp.issparse(X) or sp.issparse(y):

From bab265a267ce765af8f58708a846c04d85cb0942 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 18 Apr 2023 14:50:00 +0200
Subject: [PATCH 4/8] TST test_rescale_data with sparse y

---
 sklearn/linear_model/tests/test_base.py | 35 ++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py
index f8b1bd7b7b65c..bea1111778b27 100644
--- a/sklearn/linear_model/tests/test_base.py
+++ b/sklearn/linear_model/tests/test_base.py
@@ -661,7 +661,8 @@ def test_dtype_preprocess_data(global_random_seed):
 
 
 @pytest.mark.parametrize("n_targets", [None, 2])
-def test_rescale_data_dense(n_targets, global_random_seed):
+@pytest.mark.parametrize("sparse_data", [True, False])
+def test_rescale_data(n_targets, sparse_data, global_random_seed):
     rng = np.random.RandomState(global_random_seed)
     n_samples = 200
     n_features = 2
@@ -672,14 +673,34 @@ def test_rescale_data_dense(n_targets, global_random_seed):
         y = rng.rand(n_samples)
     else:
         y = rng.rand(n_samples, n_targets)
-    rescaled_X, rescaled_y, sqrt_sw = _rescale_data(X, y, sample_weight)
-    rescaled_X2 = X * sqrt_sw[:, np.newaxis]
+
+    expected_sqrt_sw = np.sqrt(sample_weight)
+    expected_rescaled_X = X * expected_sqrt_sw[:, np.newaxis]
+
     if n_targets is None:
-        rescaled_y2 = y * sqrt_sw
+        expected_rescaled_y = y * expected_sqrt_sw
     else:
-        rescaled_y2 = y * sqrt_sw[:, np.newaxis]
-    assert_array_almost_equal(rescaled_X, rescaled_X2)
-    assert_array_almost_equal(rescaled_y, rescaled_y2)
+        expected_rescaled_y = y * expected_sqrt_sw[:, np.newaxis]
+
+    if sparse_data:
+        X = sparse.csr_matrix(X)
+        if n_targets is None:
+            y = sparse.csr_matrix(y.reshape(-1, 1))
+        else:
+            y = sparse.csr_matrix(y)
+
+    rescaled_X, rescaled_y, sqrt_sw = _rescale_data(X, y, sample_weight)
+
+    assert_allclose(sqrt_sw, expected_sqrt_sw)
+
+    if sparse_data:
+        rescaled_X = rescaled_X.toarray()
+        rescaled_y = rescaled_y.toarray()
+        if n_targets is None:
+            rescaled_y = rescaled_y.ravel()
+
+    assert_allclose(rescaled_X, expected_rescaled_X)
+    assert_allclose(rescaled_y, expected_rescaled_y)
 
 
 def test_fused_types_make_dataset():

From 74860c48d1684c9afada7d7753ba01cee1cf51e1 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 18 Apr 2023 17:14:51 +0200
Subject: [PATCH 5/8] Insert blank line before if block for readability

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
---
 sklearn/linear_model/_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index d96b7ad0d69e3..b62466d2a23b6 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -331,6 +331,7 @@ def _rescale_data(X, y, sample_weight):
         # with the sparse case and because the _rescale_data currently
         # does not make it explicit if it's ok to do it or not.
         X = X * sample_weight_sqrt[:, np.newaxis]
+
     if sp.issparse(y):
         y = safe_sparse_dot(sw_matrix, y)
     else:

From 278c6413fdfcd779f4253b6f74292cffca2143b2 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 18 Apr 2023 19:17:55 +0200
Subject: [PATCH 6/8] one more blank line for readability

Co-authored-by: Christian Lorentzen <lorentzen.ch@gmail.com>
---
 sklearn/linear_model/_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index b62466d2a23b6..035cdcc0c8c75 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -324,6 +324,7 @@ def _rescale_data(X, y, sample_weight):
         sw_matrix = sparse.dia_matrix(
             (sample_weight_sqrt, 0), shape=(n_samples, n_samples)
         )
+
     if sp.issparse(X):
         X = safe_sparse_dot(sw_matrix, X)
     else:

From 00a69de827fd545403a32c62dee17461798bde45 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 18 Apr 2023 19:31:51 +0200
Subject: [PATCH 7/8] Remove extra data copies when not needed

---
 sklearn/linear_model/_base.py           | 55 +++++++++++++++---------
 sklearn/linear_model/tests/test_base.py | 56 +++++++++++++++++++++++++
 2 files changed, 92 insertions(+), 19 deletions(-)

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index 035cdcc0c8c75..d05713a34a139 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -187,6 +187,7 @@ def _preprocess_data(
     fit_intercept,
     normalize=False,
     copy=True,
+    copy_y=True,
     sample_weight=None,
     check_input=True,
 ):
@@ -230,13 +231,14 @@ def _preprocess_data(
 
     if check_input:
         X = check_array(X, copy=copy, accept_sparse=["csr", "csc"], dtype=FLOAT_DTYPES)
-    elif copy:
-        if sp.issparse(X):
-            X = X.copy()
-        else:
-            X = X.copy(order="K")
-
-    y = np.asarray(y, dtype=X.dtype)
+        y = check_array(y, dtype=X.dtype, copy=copy_y, ensure_2d=False)
+    else:
+        y = y.astype(X.dtype, copy=copy_y)
+        if copy:
+            if sp.issparse(X):
+                X = X.copy()
+            else:
+                X = X.copy(order="K")
 
     if fit_intercept:
         if sp.issparse(X):
@@ -276,7 +278,7 @@ def _preprocess_data(
             X_scale = np.ones(X.shape[1], dtype=X.dtype)
 
         y_offset = np.average(y, axis=0, weights=sample_weight)
-        y = y - y_offset
+        y -= y_offset
     else:
         X_offset = np.zeros(X.shape[1], dtype=X.dtype)
         X_scale = np.ones(X.shape[1], dtype=X.dtype)
@@ -293,7 +295,7 @@ def _preprocess_data(
 # sample_weight makes the refactoring tricky.
 
 
-def _rescale_data(X, y, sample_weight):
+def _rescale_data(X, y, sample_weight, inplace=False):
     """Rescale data sample-wise by square root of sample_weight.
 
     For many linear models, this enables easy support for sample_weight because
@@ -328,18 +330,24 @@ def _rescale_data(X, y, sample_weight):
     if sp.issparse(X):
         X = safe_sparse_dot(sw_matrix, X)
     else:
-        # XXX: we do not do inplace multiplication on X for consistency
-        # with the sparse case and because the _rescale_data currently
-        # does not make it explicit if it's ok to do it or not.
-        X = X * sample_weight_sqrt[:, np.newaxis]
+        if inplace:
+            X *= sample_weight_sqrt[:, np.newaxis]
+        else:
+            X = X * sample_weight_sqrt[:, np.newaxis]
 
     if sp.issparse(y):
         y = safe_sparse_dot(sw_matrix, y)
     else:
-        if y.ndim == 1:
-            y = y * sample_weight_sqrt
+        if inplace:
+            if y.ndim == 1:
+                y *= sample_weight_sqrt
+            else:
+                y *= sample_weight_sqrt[:, np.newaxis]
         else:
-            y = y * sample_weight_sqrt[:, np.newaxis]
+            if y.ndim == 1:
+                y = y * sample_weight_sqrt
+            else:
+                y = y * sample_weight_sqrt[:, np.newaxis]
     return X, y, sample_weight_sqrt
 
 
@@ -674,17 +682,26 @@ def fit(self, X, y, sample_weight=None):
                 sample_weight, X, dtype=X.dtype, only_non_negative=True
             )
 
+        # Note that neither _rescale_data nor the rest of the fit method of
+        # LinearRegression can benefit from in-place operations when X is a
+        # sparse matrix. Therefore, let's not copy X when it is sparse.
+        copy_X_in_preprocess_data = self.copy_X and not sp.issparse(X)
+
         X, y, X_offset, y_offset, X_scale = _preprocess_data(
             X,
             y,
             fit_intercept=self.fit_intercept,
-            copy=self.copy_X,
+            copy=copy_X_in_preprocess_data,
             sample_weight=sample_weight,
         )
 
-        # Sample weight can be implemented via a simple rescaling.
         if has_sw:
-            X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight)
+            # Sample weight can be implemented via a simple rescaling. Note
+            # that we safely do inplace rescaling when _preprocess_data has
+            # already made a copy if requested.
+            X, y, sample_weight_sqrt = _rescale_data(
+                X, y, sample_weight, inplace=copy_X_in_preprocess_data
+            )
 
         if self.positive:
             if y.ndim < 2:
diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py
index bea1111778b27..112583bae593a 100644
--- a/sklearn/linear_model/tests/test_base.py
+++ b/sklearn/linear_model/tests/test_base.py
@@ -315,6 +315,62 @@ def test_linear_regression_positive_vs_nonpositive_when_positive(global_random_s
     assert np.mean((reg.coef_ - regn.coef_) ** 2) < 1e-6
 
 
+@pytest.mark.parametrize("sparse_X", [True, False])
+@pytest.mark.parametrize("use_sw", [True, False])
+def test_inplace_data_preprocessing(sparse_X, use_sw, global_random_seed):
+    # Check that the data is not modified inplace by the linear regression
+    # estimator.
+    rng = np.random.RandomState(global_random_seed)
+    original_X_data = rng.randn(10, 12)
+    original_y_data = rng.randn(10, 2)
+    orginal_sw_data = rng.rand(10)
+
+    if sparse_X:
+        X = sparse.csr_matrix(original_X_data)
+    else:
+        X = original_X_data.copy()
+    y = original_y_data.copy()
+    # XXX: Note hat y_sparse is not supported (broken?) in the current
+    # implementation of LinearRegression.
+
+    if use_sw:
+        sample_weight = orginal_sw_data.copy()
+    else:
+        sample_weight = None
+
+    # Do not allow inplace preprocessing of X and y:
+    reg = LinearRegression()
+    reg.fit(X, y, sample_weight=sample_weight)
+    if sparse_X:
+        assert_allclose(X.toarray(), original_X_data)
+    else:
+        assert_allclose(X, original_X_data)
+    assert_allclose(y, original_y_data)
+
+    if use_sw:
+        assert_allclose(sample_weight, orginal_sw_data)
+
+    # Allow inplace preprocessing of X and y
+    reg = LinearRegression(copy_X=False)
+    reg.fit(X, y, sample_weight=sample_weight)
+    if sparse_X:
+        # No optimization relying on the inplace modification of sparse input
+        # data has been implemented at this time.
+        assert_allclose(X.toarray(), original_X_data)
+    else:
+        # X has been offset (and optionally rescaled by sample weights)
+        # inplace. The 0.42 threshold is arbitrary and has been found to be
+        # robust to any random seed in the admissible range.
+        assert np.linalg.norm(X - original_X_data) > 0.42
+
+    # y should not have been modified inplace by LinearRegression.fit.
+    assert_allclose(y, original_y_data)
+
+    if use_sw:
+        # Sample weights have no reason to ever be modified inplace.
+        assert_allclose(sample_weight, orginal_sw_data)
+
+
 def test_linear_regression_pd_sparse_dataframe_warning():
     pd = pytest.importorskip("pandas")
 

From 683647242f2399a2793a4f5a4ef28c6472f899d3 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 18 Apr 2023 19:40:00 +0200
Subject: [PATCH 8/8] DOC better changelog entry

---
 doc/whats_new/v1.3.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 5a0c44c37b58f..aa5ade8969003 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -294,8 +294,9 @@ Changelog
 :mod:`sklearn.linear_model`
 ...........................
 
-- |Efficiency| Avoid uncessary data scaling and conversion to sparse data
-  structures in :class:`linear_model.LinearRegression` when not necessary.
+- |Efficiency| Avoid data scaling when `sample_weight=None` and other
+  unnecessary data copies and unexpected dense to sparse data conversion in
+  :class:`linear_model.LinearRegression`.
   :pr:`26207` by :user:`Olivier Grisel <ogrisel>`.
 
 - |Enhancement| :class:`linear_model.SGDClassifier`,