From 15f39d1a0554fca42aad1903af9f66af1f3aa557 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 19 Jan 2023 11:12:53 +0100 Subject: [PATCH 1/7] TST Add non-regression test for #7981 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reproducer is adapted from the one of this message: https://github.com/scikit-learn/scikit-learn/issues/7981#issue-193466569 Co-authored-by: Loïc Estève --- sklearn/metrics/tests/test_pairwise.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 3624983c4c481..2f8bb67f92cc7 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -2,6 +2,7 @@ from types import GeneratorType import numpy as np +from joblib import Parallel from numpy import linalg from scipy.sparse import dok_matrix, csr_matrix, issparse @@ -15,7 +16,7 @@ # should be used instead. from scipy.spatial.distance import minkowski as wminkowski -from sklearn.utils.fixes import sp_version, parse_version +from sklearn.utils.fixes import sp_version, parse_version, delayed import pytest @@ -1549,3 +1550,14 @@ def test_numeric_pairwise_distances_datatypes(metric, global_dtype, y_is_x): dist = pairwise_distances(X, Y, metric=metric, **params) assert_allclose(dist, expected_dist) + + +def test_sparse_manhattan_readonly_dataset(): + # Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/7981 + matrices1 = [csr_matrix(np.ones((5, 5)))] + matrices2 = [csr_matrix(np.ones((5, 5)))] + # Joblib memory maps datasets which makes them read only. + # The following call was reporting as failing in 7981, but this must pass. + Parallel(n_jobs=-1, max_nbytes=0)( + delayed(manhattan_distances)(m1, m2) for m1, m2 in zip(matrices1, matrices2) + ) From bf2b5b498fa6e78dd0aa339028a206a4d7845dce Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 19 Jan 2023 11:16:23 +0100 Subject: [PATCH 2/7] FIX Support readonly sparse datasets for manhattan --- sklearn/metrics/_pairwise_fast.pyx | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_fast.pyx b/sklearn/metrics/_pairwise_fast.pyx index b9006773c015d..f7ddd68c46c1e 100644 --- a/sklearn/metrics/_pairwise_fast.pyx +++ b/sklearn/metrics/_pairwise_fast.pyx @@ -35,9 +35,15 @@ def _chi2_kernel_fast(floating[:, :] X, result[i, j] = -res -def _sparse_manhattan(floating[::1] X_data, int[:] X_indices, int[:] X_indptr, - floating[::1] Y_data, int[:] Y_indices, int[:] Y_indptr, - double[:, ::1] D): +def _sparse_manhattan( + const floating[::1] X_data, + const int[:] X_indices, + const int[:] X_indptr, + const floating[::1] Y_data, + const int[:] Y_indices, + const int[:] Y_indptr, + double[:, ::1] D, +): """Pairwise L1 distances for CSR matrices. Usage: From f8c38854878121bd3eb41fa0a64733b0f68feea6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 24 Jan 2023 10:34:59 +0100 Subject: [PATCH 3/7] DOC Add entry in whats_new/v1.2.rst for 1.2.1 --- doc/whats_new/v1.2.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 3dd5edfceea86..7708d2fba4dfb 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -105,6 +105,12 @@ Changelog - |Fix| :class:`manifold.TSNE` now works correctly when output type is set to pandas :pr:`25370` by :user:`Tim Head `. +:mod:`sklearn.metrics` +...................... + +- |Fix| :func:`metric.manhattan_distances` now supports readonly sparse datasets. + :pr:`25432` by :user:`Julien Jerphanion `. + :mod:`sklearn.model_selection` .............................. From 286ee5b6c34c431f9c1889322afe6c730e036eb1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 24 Jan 2023 12:05:11 +0100 Subject: [PATCH 4/7] FIX Fix comment --- sklearn/metrics/tests/test_pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index a4088ff378659..889fdecfa7c3e 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1557,7 +1557,7 @@ def test_sparse_manhattan_readonly_dataset(): matrices1 = [csr_matrix(np.ones((5, 5)))] matrices2 = [csr_matrix(np.ones((5, 5)))] # Joblib memory maps datasets which makes them read only. - # The following call was reporting as failing in 7981, but this must pass. + # The following call was reporting as failing in #7981, but this must pass. Parallel(n_jobs=-1, max_nbytes=0)( delayed(manhattan_distances)(m1, m2) for m1, m2 in zip(matrices1, matrices2) ) From 4f05003207297bde8902f1bb6f7794f935085457 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 24 Jan 2023 15:20:12 +0100 Subject: [PATCH 5/7] Update sklearn/metrics/tests/test_pairwise.py Co-authored-by: Christian Lorentzen --- sklearn/metrics/tests/test_pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 889fdecfa7c3e..aca6695675685 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1556,7 +1556,7 @@ def test_sparse_manhattan_readonly_dataset(): # Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/7981 matrices1 = [csr_matrix(np.ones((5, 5)))] matrices2 = [csr_matrix(np.ones((5, 5)))] - # Joblib memory maps datasets which makes them read only. + # Joblib memory maps datasets which makes them read-only. # The following call was reporting as failing in #7981, but this must pass. Parallel(n_jobs=-1, max_nbytes=0)( delayed(manhattan_distances)(m1, m2) for m1, m2 in zip(matrices1, matrices2) From 80f4af391b8ae76f41cba9bf552b8e46d3504403 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 24 Jan 2023 15:55:38 +0100 Subject: [PATCH 6/7] DOC Move entry to whats_new/v1.3.rst --- doc/whats_new/v1.2.rst | 6 ------ doc/whats_new/v1.3.rst | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 7708d2fba4dfb..3dd5edfceea86 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -105,12 +105,6 @@ Changelog - |Fix| :class:`manifold.TSNE` now works correctly when output type is set to pandas :pr:`25370` by :user:`Tim Head `. -:mod:`sklearn.metrics` -...................... - -- |Fix| :func:`metric.manhattan_distances` now supports readonly sparse datasets. - :pr:`25432` by :user:`Julien Jerphanion `. - :mod:`sklearn.model_selection` .............................. diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 937c9c1448030..e5bd8fcb73fee 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -122,6 +122,12 @@ Changelog - |Enhancement| Added the parameter `fill_value` to :class:`impute.IterativeImputer`. :pr:`25232` by :user:`Thijs van Weezel `. +:mod:`sklearn.metrics` +...................... + +- |Fix| :func:`metric.manhattan_distances` now supports readonly sparse datasets. + :pr:`25432` by :user:`Julien Jerphanion `. + :mod:`sklearn.naive_bayes` .......................... From 9181608364726a588b1c5092d38e1ce69c5aab65 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 24 Jan 2023 18:32:33 +0100 Subject: [PATCH 7/7] Update sklearn/metrics/tests/test_pairwise.py Co-authored-by: Olivier Grisel --- sklearn/metrics/tests/test_pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index aca6695675685..930ac93534546 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1558,6 +1558,6 @@ def test_sparse_manhattan_readonly_dataset(): matrices2 = [csr_matrix(np.ones((5, 5)))] # Joblib memory maps datasets which makes them read-only. # The following call was reporting as failing in #7981, but this must pass. - Parallel(n_jobs=-1, max_nbytes=0)( + Parallel(n_jobs=2, max_nbytes=0)( delayed(manhattan_distances)(m1, m2) for m1, m2 in zip(matrices1, matrices2) )