From 624c69c54bc0942ab5eac00bf885a950cab43030 Mon Sep 17 00:00:00 2001 From: tataganesh Date: Mon, 23 Oct 2023 19:35:00 -0600 Subject: [PATCH 1/5] FIX: Copy to avoid in-place modify --- sklearn/cluster/_dbscan.py | 3 ++- sklearn/cluster/tests/test_dbscan.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 0129138801973..98f524752a39a 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -391,9 +391,10 @@ def fit(self, X, y=None, sample_weight=None): if self.metric == "precomputed" and sparse.issparse(X): # set the diagonal to explicit values, as a point is its own # neighbor + X = X.copy() # copy to avoid in-place modification with warnings.catch_warnings(): warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning) - X.setdiag(X.diagonal()) # XXX: modifies X's internals in-place + X.setdiag(X.diagonal()) neighbors_model = NearestNeighbors( radius=self.eps, diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index 0b5a067b3f8b1..737735ef68119 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -122,6 +122,20 @@ def test_dbscan_input_not_modified(metric, csr_container): assert_array_equal(X, X_copy) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dbscan_input_not_modified_precomputed_sparse_nodiag(csr_container): + # test that the input is not modified by dbscan when the + # precomputed sparse matrix has no diagonal + # elements + X = np.random.RandomState(0).rand(10, 10) + np.fill_diagonal(X, 0) + X = csr_container(X) + X_copy = X.copy() + dbscan(X, metric="precomputed") + assert X.nnz == X_copy.nnz + assert_array_equal(X.toarray(), X_copy.toarray()) + + @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_dbscan_no_core_samples(csr_container): rng = np.random.RandomState(0) From 7ecd6ef1055316d2359d0da3a1a7401869bf3dca Mon Sep 17 00:00:00 2001 From: Tata Ganesh Date: Mon, 13 Nov 2023 18:30:00 +0530 Subject: [PATCH 2/5] Suggestions from code review - Better comments Co-authored-by: Guillaume Lemaitre --- sklearn/cluster/tests/test_dbscan.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index 737735ef68119..15b189b49615a 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -124,14 +124,21 @@ def test_dbscan_input_not_modified(metric, csr_container): @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_dbscan_input_not_modified_precomputed_sparse_nodiag(csr_container): - # test that the input is not modified by dbscan when the - # precomputed sparse matrix has no diagonal - # elements + """Check that we don't modify in-place the pre-computed sparse matrix. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27508 + """ X = np.random.RandomState(0).rand(10, 10) + # Add zeros on the diagonal that will be implicit when creating + # the sparse matrix. If `X` is modified in-place, the zeros from + # the diagonal will be made explicit. np.fill_diagonal(X, 0) X = csr_container(X) X_copy = X.copy() dbscan(X, metric="precomputed") + # Make sure that we did not modify `X` in-place even by creating + # explicit 0s values. assert X.nnz == X_copy.nnz assert_array_equal(X.toarray(), X_copy.toarray()) From 34dc9deea4fabb5b8f0d77127f26f0e791aebbf1 Mon Sep 17 00:00:00 2001 From: tataganesh Date: Tue, 14 Nov 2023 14:21:34 +0530 Subject: [PATCH 3/5] DOCS: Changelog updated --- doc/whats_new/v1.4.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index a947dfe225b04..60da8ea246caf 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -217,6 +217,11 @@ Changelog `kdtree` and `balltree` values will be removed in 1.6. :pr:`26744` by :user:`Shreesha Kumar Bhat `. +- |FIX| : Create copy of precomputed sparse matrix within the + `fit` method of `cluster.DBSCAN` to avoid in-place modification of + the sparse matrix. + :pr:`27651` by :user:`Ganesh Tata `. + :mod:`sklearn.compose` ...................... From 11da1effebf06748af772e45b3522de97b6e0871 Mon Sep 17 00:00:00 2001 From: tataganesh Date: Tue, 14 Nov 2023 14:26:06 +0530 Subject: [PATCH 4/5] FIX: Linter error --- sklearn/cluster/tests/test_dbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index 15b189b49615a..6eb70397a2e73 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -125,7 +125,7 @@ def test_dbscan_input_not_modified(metric, csr_container): @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_dbscan_input_not_modified_precomputed_sparse_nodiag(csr_container): """Check that we don't modify in-place the pre-computed sparse matrix. - + Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/27508 """ From 6c823c51e77d71ba8fb2ea0b7d602f58da7e3aed Mon Sep 17 00:00:00 2001 From: tataganesh Date: Tue, 14 Nov 2023 20:43:06 +0530 Subject: [PATCH 5/5] CHORE: Assert for implicit zeroes --- sklearn/cluster/tests/test_dbscan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index 6eb70397a2e73..d42cc2b17d518 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -135,6 +135,7 @@ def test_dbscan_input_not_modified_precomputed_sparse_nodiag(csr_container): # the diagonal will be made explicit. np.fill_diagonal(X, 0) X = csr_container(X) + assert all(row != col for row, col in zip(*X.nonzero())) X_copy = X.copy() dbscan(X, metric="precomputed") # Make sure that we did not modify `X` in-place even by creating