From 3ad593b23b73228cfa42a0f7239105d65568093f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 10:05:52 +0200 Subject: [PATCH 01/21] Reword 'radius_neighborhood' for 'radius_neigbors' --- .gitignore | 4 +-- setup.cfg | 4 +-- setup.py | 2 +- .../_dispatcher.py | 2 +- ...orhood.pxd.tp => _radius_neighbors.pxd.tp} | 0 ...orhood.pyx.tp => _radius_neighbors.pyx.tp} | 0 .../_pairwise_distances_reduction/setup.py | 6 ++-- .../test_pairwise_distances_reduction.py | 28 +++++++++---------- sklearn/neighbors/tests/test_neighbors.py | 4 +-- 9 files changed, 25 insertions(+), 25 deletions(-) rename sklearn/metrics/_pairwise_distances_reduction/{_radius_neighborhood.pxd.tp => _radius_neighbors.pxd.tp} (100%) rename sklearn/metrics/_pairwise_distances_reduction/{_radius_neighborhood.pyx.tp => _radius_neighbors.pyx.tp} (100%) diff --git a/.gitignore b/.gitignore index 24f562af3df15..68299a202b7c7 100644 --- a/.gitignore +++ b/.gitignore @@ -95,5 +95,5 @@ sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx -sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd -sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx +sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd +sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx diff --git a/setup.cfg b/setup.cfg index 81fbbffadb233..3aa89052a4b8e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -79,8 +79,8 @@ ignore = sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx - sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd - sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx + sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd + sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx [codespell] diff --git a/setup.py b/setup.py index a22df6e647a8e..cc5b6f11749cb 100755 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ "sklearn.metrics._pairwise_distances_reduction._gemm_term_computer", "sklearn.metrics._pairwise_distances_reduction._base", "sklearn.metrics._pairwise_distances_reduction._argkmin", - "sklearn.metrics._pairwise_distances_reduction._radius_neighborhood", + "sklearn.metrics._pairwise_distances_reduction._radius_neighbors", "sklearn.metrics._pairwise_fast", "sklearn.neighbors._partition_nodes", "sklearn.tree._splitter", diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 809d683b52ced..d9875c9d0fc6d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -16,7 +16,7 @@ ArgKmin64, ArgKmin32, ) -from ._radius_neighborhood import ( +from ._radius_neighbors import ( RadiusNeighbors64, RadiusNeighbors32, ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp similarity index 100% rename from sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp rename to sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp similarity index 100% rename from sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp rename to sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp diff --git a/sklearn/metrics/_pairwise_distances_reduction/setup.py b/sklearn/metrics/_pairwise_distances_reduction/setup.py index f55ec659b5821..e1fbbceea7eb8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/setup.py +++ b/sklearn/metrics/_pairwise_distances_reduction/setup.py @@ -21,8 +21,8 @@ def configuration(parent_package="", top_path=None): "sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp", "sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp", "sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp", - "sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp", - "sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp", + "sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp", + "sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp", ] gen_from_templates(templates) @@ -32,7 +32,7 @@ def configuration(parent_package="", top_path=None): "_gemm_term_computer.pyx", "_base.pyx", "_argkmin.pyx", - "_radius_neighborhood.pyx", + "_radius_neighbors.pyx", ] for source_file in cython_sources: diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 52f5f3e73948c..c4c2dd55af30b 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -185,7 +185,7 @@ def assert_argkmin_results_quasi_equality( ), msg -def assert_radius_neighborhood_results_equality( +def assert_radius_neighbors_results_equality( ref_dist, dist, ref_indices, indices, radius ): # We get arrays of arrays and we need to check for individual pairs @@ -204,7 +204,7 @@ def assert_radius_neighborhood_results_equality( ) -def assert_radius_neighborhood_results_quasi_equality( +def assert_radius_neighbors_results_quasi_equality( ref_dist, dist, ref_indices, @@ -308,7 +308,7 @@ def assert_radius_neighborhood_results_quasi_equality( ( RadiusNeighbors, np.float64, - ): assert_radius_neighborhood_results_equality, + ): assert_radius_neighbors_results_equality, # In the case of 32bit, indices can be permuted due to small difference # in the computations of their associated distances, hence we test equality of # results up to valid permutations. @@ -316,7 +316,7 @@ def assert_radius_neighborhood_results_quasi_equality( ( RadiusNeighbors, np.float32, - ): assert_radius_neighborhood_results_quasi_equality, + ): assert_radius_neighbors_results_quasi_equality, } @@ -404,7 +404,7 @@ def test_assert_argkmin_results_quasi_equality(): ) -def test_assert_radius_neighborhood_results_quasi_equality(): +def test_assert_radius_neighbors_results_quasi_equality(): rtol = 1e-7 eps = 1e-7 @@ -425,7 +425,7 @@ def test_assert_radius_neighborhood_results_quasi_equality(): ] # Sanity check: compare the reference results to themselves. - assert_radius_neighborhood_results_quasi_equality( + assert_radius_neighbors_results_quasi_equality( ref_dist, ref_dist, ref_indices, @@ -435,7 +435,7 @@ def test_assert_radius_neighborhood_results_quasi_equality(): ) # Apply valid permutation on indices - assert_radius_neighborhood_results_quasi_equality( + assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), @@ -443,7 +443,7 @@ def test_assert_radius_neighborhood_results_quasi_equality(): radius=6.1, rtol=rtol, ) - assert_radius_neighborhood_results_quasi_equality( + assert_radius_neighbors_results_quasi_equality( np.array([np.array([_1m, _1m, 1, _1p, _1p])]), np.array([np.array([_1m, _1m, 1, _1p, _1p])]), np.array([np.array([6, 7, 8, 9, 10])]), @@ -455,7 +455,7 @@ def test_assert_radius_neighborhood_results_quasi_equality(): # Apply invalid permutation on indices msg = "Neighbors indices for query 0 are not matching" with pytest.raises(AssertionError, match=msg): - assert_radius_neighborhood_results_quasi_equality( + assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), @@ -465,7 +465,7 @@ def test_assert_radius_neighborhood_results_quasi_equality(): ) # Having extra last elements is valid if they are in: [radius ± rtol] - assert_radius_neighborhood_results_quasi_equality( + assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1])]), np.array([np.array([1, 2, 3, 4, 5])]), @@ -479,7 +479,7 @@ def test_assert_radius_neighborhood_results_quasi_equality(): "The last extra elements ([6.]) aren't in [radius ± rtol]=[6.1 ± 1e-07]" ) with pytest.raises(AssertionError, match=msg): - assert_radius_neighborhood_results_quasi_equality( + assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, 6])]), np.array([np.array([1.2, 2.5])]), np.array([np.array([1, 2, 3])]), @@ -491,7 +491,7 @@ def test_assert_radius_neighborhood_results_quasi_equality(): # Indices aren't properly sorted w.r.t their distances msg = "Neighbors indices for query 0 are not matching" with pytest.raises(AssertionError, match=msg): - assert_radius_neighborhood_results_quasi_equality( + assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), @@ -503,7 +503,7 @@ def test_assert_radius_neighborhood_results_quasi_equality(): # Distances aren't properly sorted msg = "Distances aren't sorted on row 0" with pytest.raises(AssertionError, match=msg): - assert_radius_neighborhood_results_quasi_equality( + assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), @@ -631,7 +631,7 @@ def test_argkmin_factory_method_wrong_usages(): ) -def test_radius_neighborhood_factory_method_wrong_usages(): +def test_radius_neighbors_factory_method_wrong_usages(): rng = np.random.RandomState(1) X = rng.rand(100, 10) Y = rng.rand(100, 10) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 0a75b57cc60ae..4cea0bbb00a5f 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -29,7 +29,7 @@ from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS from sklearn.metrics.tests.test_pairwise_distances_reduction import ( - assert_radius_neighborhood_results_equality, + assert_radius_neighbors_results_equality, ) from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split @@ -2174,7 +2174,7 @@ def test_radius_neighbors_brute_backend( X_test, return_distance=True ) - assert_radius_neighborhood_results_equality( + assert_radius_neighbors_results_equality( legacy_brute_dst, pdr_brute_dst, legacy_brute_idx, From cb086a43284dddf9ea783f526d5b3de28d16679b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 10:07:27 +0200 Subject: [PATCH 02/21] Remove out-of-date comment --- .../_pairwise_distances_reduction/_dispatcher.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index d9875c9d0fc6d..197f273d36cd2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -274,11 +274,6 @@ def compute( for the concrete implementation are therefore freed when this classmethod returns. """ - # Note (jjerphan): Some design thoughts for future extensions. - # This factory comes to handle specialisations for the given arguments. - # For future work, this might can be an entrypoint to specialise operations - # for various backend and/or hardware and/or datatypes, and/or fused - # {sparse, dense}-datasetspair etc. if X.dtype == Y.dtype == np.float64: return ArgKmin64.compute( X=X, @@ -425,11 +420,6 @@ def compute( This allows entirely decoupling the API entirely from the implementation details whilst maintaining RAII. """ - # Note (jjerphan): Some design thoughts for future extensions. - # This factory comes to handle specialisations for the given arguments. - # For future work, this might can be an entrypoint to specialise operations - # for various backend and/or hardware and/or datatypes, and/or fused - # {sparse, dense}-datasetspair etc. if X.dtype == Y.dtype == np.float64: return RadiusNeighbors64.compute( X=X, From f2a6686188f767e8c6200e8f41d15eea0d1a10a0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 10:09:41 +0200 Subject: [PATCH 03/21] Reword to use BaseDistancesReduction --- .../_pairwise_distances_reduction/__init__.py | 10 +++--- .../_argkmin.pxd.tp | 6 ++-- .../_argkmin.pyx.tp | 6 ++-- .../_base.pxd.tp | 2 +- .../_base.pyx.tp | 4 +-- .../_datasets_pair.pyx.tp | 2 +- .../_dispatcher.py | 8 ++--- .../_radius_neighbors.pxd.tp | 6 ++-- .../_radius_neighbors.pyx.tp | 6 ++-- .../test_pairwise_distances_reduction.py | 32 +++++++++---------- 10 files changed, 41 insertions(+), 41 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/sklearn/metrics/_pairwise_distances_reduction/__init__.py index 1aebb8bc4a572..db21d066ab1a6 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/__init__.py +++ b/sklearn/metrics/_pairwise_distances_reduction/__init__.py @@ -32,7 +32,7 @@ # # Dispatchers are meant to be used in the Python code. Under the hood, a # dispatcher must only define the logic to choose at runtime to the correct -# dtype-specialized :class:`BaseDistanceReductionDispatcher` implementation based +# dtype-specialized :class:`BaseDistancesReductionDispatcher` implementation based # on the dtype of X and of Y. # # @@ -46,7 +46,7 @@ # # # (base dispatcher) -# BaseDistanceReductionDispatcher +# BaseDistancesReductionDispatcher # ∆ # | # | @@ -57,7 +57,7 @@ # | | # | | # | (64bit implem.) | -# | BaseDistanceReducer{32,64} | +# | BaseDistancesReduction{32,64} | # | ∆ | # | | | # | | | @@ -87,14 +87,14 @@ from ._dispatcher import ( - BaseDistanceReductionDispatcher, + BaseDistancesReductionDispatcher, ArgKmin, RadiusNeighbors, sqeuclidean_row_norms, ) __all__ = [ - "BaseDistanceReductionDispatcher", + "BaseDistancesReductionDispatcher", "ArgKmin", "RadiusNeighbors", "sqeuclidean_row_norms", diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp index 37dd7f5836857..0f8b5a756bbfd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -21,11 +21,11 @@ cnp.import_array() {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} -from ._base cimport BaseDistanceReducer{{name_suffix}} +from ._base cimport BaseDistancesReduction{{name_suffix}} from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} -cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): - """{{name_suffix}}bit implementation of BaseDistanceReducer{{name_suffix}} for the `ArgKmin` reduction.""" +cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): + """{{name_suffix}}bit implementation of BaseDistancesReduction{{name_suffix}} for the `ArgKmin` reduction.""" cdef: ITYPE_t k diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 1c1459e27f210..bf948b5d67f6e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -40,7 +40,7 @@ cnp.import_array() {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} from ._base cimport ( - BaseDistanceReducer{{name_suffix}}, + BaseDistancesReduction{{name_suffix}}, _sqeuclidean_row_norms{{name_suffix}}, ) @@ -52,8 +52,8 @@ from ._datasets_pair cimport ( from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} -cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): - """{{name_suffix}}bit implementation of the pairwise-distance reduction BaseDistanceReducer{{name_suffix}}.""" +cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): + """{{name_suffix}}bit implementation of the pairwise-distance reduction BaseDistancesReduction{{name_suffix}}.""" @classmethod def compute( diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp index 77a689a9e4e94..24aeff4013c69 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -30,7 +30,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}( ITYPE_t num_threads, ) -cdef class BaseDistanceReducer{{name_suffix}}: +cdef class BaseDistancesReduction{{name_suffix}}: """ Base {{name_suffix}}bit implementation template of the pairwise-distances reduction backend. diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index d0c06de0f8761..c6b344c66004a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -105,7 +105,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( from ._datasets_pair cimport DatasetsPair{{name_suffix}} -cdef class BaseDistanceReducer{{name_suffix}}: +cdef class BaseDistancesReduction{{name_suffix}}: """ Base {{name_suffix}}bit implementation template of the pairwise-distances reduction backend. @@ -341,7 +341,7 @@ cdef class BaseDistanceReducer{{name_suffix}}: ) nogil: """Compute the pairwise distances on two chunks of X and Y and reduce them. - This is THE core computational method of BaseDistanceReducer{{name_suffix}}. + This is THE core computational method of BaseDistancesReduction{{name_suffix}}. This must be implemented in subclasses agnostically from the parallelization strategies. """ diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index cfa37a004f17a..78857341f9c97 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -35,7 +35,7 @@ cdef class DatasetsPair{{name_suffix}}: The handling of parallelization over chunks to compute the distances and aggregation for several rows at a time is done in dedicated - subclasses of :class:`BaseDistanceReductionDispatcher` that in-turn rely on + subclasses of :class:`BaseDistancesReductionDispatcher` that in-turn rely on subclasses of :class:`DatasetsPair` for each pair of rows in the data. The goal is to make it possible to decouple the generic parallelization and aggregation logic from metric-specific computation as much as possible. diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 197f273d36cd2..56589b1174477 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -51,10 +51,10 @@ def sqeuclidean_row_norms(X, num_threads): ) -class BaseDistanceReductionDispatcher: +class BaseDistancesReductionDispatcher: """Abstract base dispatcher for pairwise distance computation & reduction. - Each dispatcher extending the base :class:`BaseDistanceReductionDispatcher` + Each dispatcher extending the base :class:`BaseDistancesReductionDispatcher` dispatcher must implement the :meth:`compute` classmethod. """ @@ -165,7 +165,7 @@ def compute( """ -class ArgKmin(BaseDistanceReductionDispatcher): +class ArgKmin(BaseDistancesReductionDispatcher): """Compute the argkmin of row vectors of X on the ones of Y. For each row vector of X, computes the indices of k first the rows @@ -304,7 +304,7 @@ def compute( ) -class RadiusNeighbors(BaseDistanceReductionDispatcher): +class RadiusNeighbors(BaseDistancesReductionDispatcher): """Compute radius-based neighbors for two sets of vectors. For each row-vector X[i] of the queries X, find all the indices j of diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp index 9bbe186589f20..631211186d7d5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp @@ -43,12 +43,12 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( ##################### {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} -from ._base cimport BaseDistanceReducer{{name_suffix}} +from ._base cimport BaseDistancesReduction{{name_suffix}} from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} -cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): +cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): """ - {{name_suffix}}bit implementation of BaseDistanceReducer{{name_suffix}} for the + {{name_suffix}}bit implementation of BaseDistancesReduction{{name_suffix}} for the `RadiusNeighbors` reduction. """ diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 8d50a6e04abf9..c157d7ba877ce 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -59,7 +59,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} from ._base cimport ( - BaseDistanceReducer{{name_suffix}}, + BaseDistancesReduction{{name_suffix}}, _sqeuclidean_row_norms{{name_suffix}} ) @@ -71,9 +71,9 @@ from ._datasets_pair cimport ( from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} -cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): +cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): """ - {{name_suffix}}bit implementation of the pairwise-distance reduction BaseDistanceReducer{{name_suffix}} for the + {{name_suffix}}bit implementation of the pairwise-distance reduction BaseDistancesReduction{{name_suffix}} for the `RadiusNeighbors` reduction. """ diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index c4c2dd55af30b..f929a55105509 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -10,7 +10,7 @@ from scipy.spatial.distance import cdist from sklearn.metrics._pairwise_distances_reduction import ( - BaseDistanceReductionDispatcher, + BaseDistancesReductionDispatcher, ArgKmin, RadiusNeighbors, sqeuclidean_row_norms, @@ -522,33 +522,33 @@ def test_pairwise_distances_reduction_is_usable_for(): metric = "manhattan" # Must be usable for all possible pair of {dense, sparse} datasets - assert BaseDistanceReductionDispatcher.is_usable_for(X, Y, metric) - assert BaseDistanceReductionDispatcher.is_usable_for(X_csr, Y_csr, metric) - assert BaseDistanceReductionDispatcher.is_usable_for(X_csr, Y, metric) - assert BaseDistanceReductionDispatcher.is_usable_for(X, Y_csr, metric) + assert BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric) + assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y_csr, metric) + assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric) + assert BaseDistancesReductionDispatcher.is_usable_for(X, Y_csr, metric) - assert BaseDistanceReductionDispatcher.is_usable_for( + assert BaseDistancesReductionDispatcher.is_usable_for( X.astype(np.float64), Y.astype(np.float64), metric ) - assert BaseDistanceReductionDispatcher.is_usable_for( + assert BaseDistancesReductionDispatcher.is_usable_for( X.astype(np.float32), Y.astype(np.float32), metric ) - assert not BaseDistanceReductionDispatcher.is_usable_for( + assert not BaseDistancesReductionDispatcher.is_usable_for( X.astype(np.int64), Y.astype(np.int64), metric ) - assert not BaseDistanceReductionDispatcher.is_usable_for(X, Y, metric="pyfunc") - assert not BaseDistanceReductionDispatcher.is_usable_for( + assert not BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric="pyfunc") + assert not BaseDistancesReductionDispatcher.is_usable_for( X.astype(np.float32), Y, metric ) - assert not BaseDistanceReductionDispatcher.is_usable_for( + assert not BaseDistancesReductionDispatcher.is_usable_for( X, Y.astype(np.int32), metric ) # F-ordered arrays are not supported - assert not BaseDistanceReductionDispatcher.is_usable_for( + assert not BaseDistancesReductionDispatcher.is_usable_for( np.asfortranarray(X), Y, metric ) @@ -558,17 +558,17 @@ def test_pairwise_distances_reduction_is_usable_for(): # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669 # noqa # TODO: implement specialisation for (sq)euclidean on fused sparse-dense # using sparse-dense routines for matrix-vector multiplications. - assert not BaseDistanceReductionDispatcher.is_usable_for( + assert not BaseDistancesReductionDispatcher.is_usable_for( X_csr, Y, metric="euclidean" ) - assert not BaseDistanceReductionDispatcher.is_usable_for( + assert not BaseDistancesReductionDispatcher.is_usable_for( X_csr, Y_csr, metric="sqeuclidean" ) # CSR matrices without non-zeros elements aren't currently supported # TODO: support CSR matrices without non-zeros elements X_csr_0_nnz = csr_matrix(X * 0) - assert not BaseDistanceReductionDispatcher.is_usable_for(X_csr_0_nnz, Y, metric) + assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_0_nnz, Y, metric) # CSR matrices with int64 indices and indptr (e.g. large nnz, or large n_features) # aren't supported as of now. @@ -576,7 +576,7 @@ def test_pairwise_distances_reduction_is_usable_for(): # TODO: support CSR matrices with int64 indices and indptr X_csr_int64 = csr_matrix(X) X_csr_int64.indices = X_csr_int64.indices.astype(np.int64) - assert not BaseDistanceReductionDispatcher.is_usable_for(X_csr_int64, Y, metric) + assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_int64, Y, metric) def test_argkmin_factory_method_wrong_usages(): From 205edb2bd8a6aae270f946cb32efa76b59a61656 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 10:38:23 +0200 Subject: [PATCH 04/21] Clarify comments regarding float32 and float64 --- .../_pairwise_distances_reduction/__init__.py | 8 +-- .../_argkmin.pxd.tp | 4 +- .../_argkmin.pyx.tp | 4 +- .../_base.pxd.tp | 4 +- .../_base.pyx.tp | 60 +++++++++++++++---- .../_gemm_term_computer.pxd.tp | 2 +- .../_gemm_term_computer.pyx.tp | 2 +- .../_radius_neighbors.pxd.tp | 7 +-- .../_radius_neighbors.pyx.tp | 7 +-- 9 files changed, 63 insertions(+), 35 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/sklearn/metrics/_pairwise_distances_reduction/__init__.py index db21d066ab1a6..133c854682f0c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/__init__.py +++ b/sklearn/metrics/_pairwise_distances_reduction/__init__.py @@ -56,7 +56,7 @@ # ArgKmin RadiusNeighbors # | | # | | -# | (64bit implem.) | +# | (float{32,64} implem.) | # | BaseDistancesReduction{32,64} | # | ∆ | # | | | @@ -74,9 +74,9 @@ # x | | x # EuclideanArgKmin{32,64} EuclideanRadiusNeighbors{32,64} # -# For instance :class:`ArgKmin`, dispatches to both :class:`ArgKmin64` -# and :class:`ArgKmin32` if X and Y are both dense NumPy arrays with a `float64` -# or `float32` dtype respectively. +# For instance :class:`ArgKmin` dispatches to: +# - :class:`ArgKmin64` if X and Y are two `float64` array-likes +# - :class:`ArgKmin32` if X and Y are two `float32` array-likes # # In addition, if the metric parameter is set to "euclidean" or "sqeuclidean", # then `ArgKmin{32,64}` further dispatches to `EuclideanArgKmin{32,64}`. For diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp index 0f8b5a756bbfd..bf304beb52891 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -25,7 +25,7 @@ from ._base cimport BaseDistancesReduction{{name_suffix}} from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): - """{{name_suffix}}bit implementation of BaseDistancesReduction{{name_suffix}} for the `ArgKmin` reduction.""" + """float{{name_suffix}} implementation of the ArgKmin.""" cdef: ITYPE_t k @@ -39,7 +39,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): - """EuclideanDistance-specialized {{name_suffix}}bit implementation of ArgKmin{{name_suffix}}.""" + """EuclideanDistance-specialisation of ArgKmin{{name_suffix}}.""" cdef: GEMMTermComputer{{name_suffix}} gemm_term_computer const DTYPE_t[::1] X_norm_squared diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index bf948b5d67f6e..1189f1f5cd682 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -53,7 +53,7 @@ from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): - """{{name_suffix}}bit implementation of the pairwise-distance reduction BaseDistancesReduction{{name_suffix}}.""" + """float{{name_suffix}} implementation of the ArgKmin.""" @classmethod def compute( @@ -328,7 +328,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): - """EuclideanDistance-specialized implementation for ArgKmin{{name_suffix}}.""" + """EuclideanDistance-specialisation of ArgKmin{{name_suffix}}.""" @classmethod def is_usable_for(cls, X, Y, metric) -> bool: diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp index 24aeff4013c69..b5e4261c2217a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -32,8 +32,8 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}( cdef class BaseDistancesReduction{{name_suffix}}: """ - Base {{name_suffix}}bit implementation template of the pairwise-distances reduction - backend. + Base float{{name_suffix}} implementation template of the pairwise-distances + reduction backends. Implementations inherit from this template and may override the several defined hooks as needed in order to easily extend functionality with diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index c6b344c66004a..aeb28e8103b6a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -82,7 +82,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( ITYPE_t d = X.shape[1] DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) - # To upcast the i-th row of X from 32bit to 64bit + # To upcast the i-th row of X from float32 to float64 vector[vector[DTYPE_t]] X_i_upcast = vector[vector[DTYPE_t]]( num_threads, vector[DTYPE_t](d) ) @@ -90,7 +90,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( with nogil, parallel(num_threads=num_threads): thread_num = openmp.omp_get_thread_num() for i in prange(n, schedule='static'): - # Upcasting the i-th row of X from 32bit to 64bit + # Upcasting the i-th row of X from float32 to float64 for j in range(d): X_i_upcast[thread_num][j] = deref(X_ptr + i * d + j) @@ -107,8 +107,8 @@ from ._datasets_pair cimport DatasetsPair{{name_suffix}} cdef class BaseDistancesReduction{{name_suffix}}: """ - Base {{name_suffix}}bit implementation template of the pairwise-distances reduction - backend. + Base float{{name_suffix}} implementation template of the pairwise-distances + reduction backends. Implementations inherit from this template and may override the several defined hooks as needed in order to easily extend functionality with @@ -224,7 +224,6 @@ cdef class BaseDistancesReduction{{name_suffix}}: X_end = X_start + self.X_n_samples_chunk # Reinitializing thread datastructures for the new X chunk - # If necessary, upcast X[X_start:X_end] to 64bit self._parallel_on_X_init_chunk(thread_num, X_start, X_end) for Y_chunk_idx in range(self.Y_n_chunks): @@ -234,7 +233,6 @@ cdef class BaseDistancesReduction{{name_suffix}}: else: Y_end = Y_start + self.Y_n_samples_chunk - # If necessary, upcast Y[Y_start:Y_end] to 64bit self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, @@ -295,7 +293,6 @@ cdef class BaseDistancesReduction{{name_suffix}}: thread_num = _openmp_thread_num() # Initializing datastructures used in this thread - # If necessary, upcast X[X_start:X_end] to 64bit self._parallel_on_Y_parallel_init(thread_num, X_start, X_end) for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): @@ -305,7 +302,6 @@ cdef class BaseDistancesReduction{{name_suffix}}: else: Y_end = Y_start + self.Y_n_samples_chunk - # If necessary, upcast Y[Y_start:Y_end] to 64bit self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, @@ -373,7 +369,18 @@ cdef class BaseDistancesReduction{{name_suffix}}: ITYPE_t X_start, ITYPE_t X_end, ) nogil: - """Initialize datastructures used in a thread given its number.""" + """Initialize datastructures used in a thread given its number. + + EuclideanDistance specialisations of subclass of BaseDistancesReduction _must_ + in this method call: + + self.gemm_term_computer._parallel_on_X_init_chunk( + thread_num, X_start, X_end, + ) + + to ensure for the float32 case, the proper upcast of X[X_start:X_end] to + dedicated float64 buffers prior the reduction. + """ return cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( @@ -386,7 +393,15 @@ cdef class BaseDistancesReduction{{name_suffix}}: ) nogil: """Initialize datastructures just before the _compute_and_reduce_distances_on_chunks. - This is eventually used to upcast X[X_start:X_end] to 64bit. + EuclideanDistance specialisations of subclass of BaseDistancesReduction _must_ + in this method call: + + self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + to ensure for the float32 case, the proper upcast of Y[Y_start:Y_end] to + dedicated float64 buffers prior the reduction. """ return @@ -418,7 +433,18 @@ cdef class BaseDistancesReduction{{name_suffix}}: ITYPE_t X_start, ITYPE_t X_end, ) nogil: - """Initialize datastructures used in a thread given its number.""" + """Initialize datastructures used in a thread given its number. + + EuclideanDistance specialisations of subclass of BaseDistancesReduction _must_ + in this method call: + + self.gemm_term_computer._parallel_on_Y_parallel_init( + thread_num, X_start, X_end, + ) + + to ensure for the float32 case, the proper upcast of X[X_start:X_end] to + dedicated float64 buffers prior the reduction. + """ return cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( @@ -431,8 +457,16 @@ cdef class BaseDistancesReduction{{name_suffix}}: ) nogil: """Initialize datastructures just before the _compute_and_reduce_distances_on_chunks. - This is eventually used to upcast Y[Y_start:Y_end] to 64bit. - """ + EuclideanDistance specialisations of subclass of BaseDistancesReduction _must_ + in this method call: + + self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + to ensure for the float32 case, the proper upcast of Y[Y_start:Y_end] to + dedicated float64 buffers prior the reduction. + """"" return cdef void _parallel_on_Y_synchronize( diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp index 5978cfee9ebee..f0599185a1b64 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp @@ -35,7 +35,7 @@ cdef class GEMMTermComputer{{name_suffix}}: vector[vector[DTYPE_t]] dist_middle_terms_chunks {{if upcast_to_float64}} - # Buffers for upcasting chunks of X and Y from 32bit to 64bit + # Buffers for upcasting chunks of X and Y from float32 to float64 vector[vector[DTYPE_t]] X_c_upcast vector[vector[DTYPE_t]] Y_c_upcast {{endif}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index 35e57219a96a7..37c482c961627 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -66,7 +66,7 @@ cdef class GEMMTermComputer{{name_suffix}}: self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) {{if upcast_to_float64}} - # We populate the buffer for upcasting chunks of X and Y from 32bit to 64bit. + # We populate the buffer for upcasting chunks of X and Y from float32 to float64. self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp index 631211186d7d5..2d72c8d88a4c0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp @@ -47,10 +47,7 @@ from ._base cimport BaseDistancesReduction{{name_suffix}} from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): - """ - {{name_suffix}}bit implementation of BaseDistancesReduction{{name_suffix}} for the - `RadiusNeighbors` reduction. - """ + """float{{name_suffix}} implementation of the RadiusNeighbors.""" cdef: DTYPE_t radius @@ -97,7 +94,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}): - """EuclideanDistance-specialized {{name_suffix}}bit implementation for RadiusNeighbors{{name_suffix}}.""" + """EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}.""" cdef: GEMMTermComputer{{name_suffix}} gemm_term_computer const DTYPE_t[::1] X_norm_squared diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index c157d7ba877ce..1890d60b20b20 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -72,10 +72,7 @@ from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): - """ - {{name_suffix}}bit implementation of the pairwise-distance reduction BaseDistancesReduction{{name_suffix}} for the - `RadiusNeighbors` reduction. - """ + """float{{name_suffix}} implementation of the RadiusNeighbors.""" @classmethod def compute( @@ -335,7 +332,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}): - """EuclideanDistance-specialized implementation for RadiusNeighbors{{name_suffix}}.""" + """EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}.""" @classmethod def is_usable_for(cls, X, Y, metric) -> bool: From 290e13852954f5980e6ebad77cddfd17b4e804b4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 10:43:14 +0200 Subject: [PATCH 05/21] Use 'potential' instead of 'eventual' --- sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 4 ++-- .../_pairwise_distances_reduction/_radius_neighbors.pyx.tp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 1189f1f5cd682..a1f3d39de07fb 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -309,7 +309,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): num_threads=self.effective_n_threads): for j in range(self.k): distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist( - # Guard against eventual -0., causing nan production. + # Guard against potential -0., causing nan production. max(distances[i, j], 0.) ) @@ -321,7 +321,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Values are returned identically to the way `KNeighborsMixin.kneighbors` # returns values. This is counter-intuitive but this allows not using - # complex adaptations where `ArgKmin{{name_suffix}}.compute` is called. + # complex adaptations where `ArgKmin.compute` is called. return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) return np.asarray(self.argkmin_indices) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 1890d60b20b20..165c79209c79c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -325,7 +325,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) for j in range(deref(self.neigh_indices)[i].size()): deref(self.neigh_distances)[i][j] = ( self.datasets_pair.distance_metric._rdist_to_dist( - # Guard against eventual -0., causing nan production. + # Guard against potential -0., causing nan production. max(deref(self.neigh_distances)[i][j], 0.) ) ) From ca16041f5e10de991ed0a3605888429b6f441ced Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 11:04:09 +0200 Subject: [PATCH 06/21] =?UTF-8?q?GEMMTermComputer.{=5Fcompute=5Fdistances?= =?UTF-8?q?=5Fon=5Fchunks=E2=86=92=5Fcompute=5Fdist=5Fmiddle=5Fterms}?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 2 +- .../_gemm_term_computer.pxd.tp | 2 +- .../_gemm_term_computer.pyx.tp | 8 ++++---- .../_radius_neighbors.pyx.tp | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index a1f3d39de07fb..48f4bd61ef0b2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -498,7 +498,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): DTYPE_t squared_dist_i_j ITYPE_t n_X = X_end - X_start ITYPE_t n_Y = Y_end - Y_start - DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( + DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_dist_middle_terms( X_start, X_end, Y_start, Y_end, thread_num ) DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num] diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp index f0599185a1b64..2f4f040b91ad8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp @@ -76,7 +76,7 @@ cdef class GEMMTermComputer{{name_suffix}}: ITYPE_t thread_num ) nogil - cdef DTYPE_t * _compute_distances_on_chunks( + cdef DTYPE_t * _compute_dist_middle_terms( self, ITYPE_t X_start, ITYPE_t X_end, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index 37c482c961627..fe7b4ae9f7ad8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -32,10 +32,10 @@ from ...utils._cython_blas cimport ( {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} cdef class GEMMTermComputer{{name_suffix}}: - """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM. + """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM. - `FastEuclidean*` classes internally compute the squared Euclidean distances between - chunks of vectors X_c and Y_c using the following decomposition: + `EuclideanDistance` subclasses internally compute the squared Euclidean distances + between chunks of vectors X_c and Y_c using the following decomposition: ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² @@ -167,7 +167,7 @@ cdef class GEMMTermComputer{{name_suffix}}: return {{endif}} - cdef DTYPE_t * _compute_distances_on_chunks( + cdef DTYPE_t * _compute_dist_middle_terms( self, ITYPE_t X_start, ITYPE_t X_end, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 165c79209c79c..be0ee5e4d1f39 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -503,7 +503,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} DTYPE_t squared_dist_i_j ITYPE_t n_X = X_end - X_start ITYPE_t n_Y = Y_end - Y_start - DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( + DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_dist_middle_terms( X_start, X_end, Y_start, Y_end, thread_num ) From cd768f9047627708d855fdc64fc05359623d326a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 11:10:18 +0200 Subject: [PATCH 07/21] Use safer unpacking for {X,Y}_norm_squared --- .../_argkmin.pyx.tp | 35 +++++++++++-------- .../_radius_neighbors.pyx.tp | 35 +++++++++++-------- 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 48f4bd61ef0b2..e11652ecfa9d6 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -347,8 +347,10 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ): if ( metric_kwargs is not None and - len(metric_kwargs) > 0 and - "Y_norm_squared" not in metric_kwargs + len(metric_kwargs) > 0 and ( + "Y_norm_squared" not in metric_kwargs or + "X_norm_squared" not in metric_kwargs + ) ): warnings.warn( f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " @@ -382,21 +384,26 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): chunk_size=self.chunk_size, ) - if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: - self.Y_norm_squared = check_array( - metric_kwargs.pop("Y_norm_squared"), - ensure_2d=False, - input_name="Y_norm_squared", - dtype=np.float64 + if metric_kwargs is not None and metric_kwargs.get("Y_norm_squared", None) is not None: + self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") + else: + self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( + datasets_pair.Y, + self.effective_n_threads, ) + + if metric_kwargs is not None and metric_kwargs.get("X_norm_squared", None) is not None: + self.X_norm_squared = metric_kwargs.pop("X_norm_squared") else: - self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads) + # Do not recompute norms if datasets are identical. + self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms{{name_suffix}}( + datasets_pair.X, + self.effective_n_threads, + ) + ) - # Do not recompute norms if datasets are identical. - self.X_norm_squared = ( - self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads) - ) self.use_squared_distances = use_squared_distances @final diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index be0ee5e4d1f39..d2177554299ea 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -352,8 +352,10 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ): if ( metric_kwargs is not None and - len(metric_kwargs) > 0 and - "Y_norm_squared" not in metric_kwargs + len(metric_kwargs) > 0 and ( + "Y_norm_squared" not in metric_kwargs or + "X_norm_squared" not in metric_kwargs + ) ): warnings.warn( f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " @@ -388,21 +390,26 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} chunk_size=self.chunk_size, ) - if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: - self.Y_norm_squared = check_array( - metric_kwargs.pop("Y_norm_squared"), - ensure_2d=False, - input_name="Y_norm_squared", - dtype=np.float64 + if metric_kwargs is not None and metric_kwargs.get("Y_norm_squared", None) is not None: + self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") + else: + self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( + datasets_pair.Y, + self.effective_n_threads, ) + + if metric_kwargs is not None and metric_kwargs.get("X_norm_squared", None) is not None: + self.X_norm_squared = metric_kwargs.pop("X_norm_squared") else: - self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads) + # Do not recompute norms if datasets are identical. + self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms{{name_suffix}}( + datasets_pair.X, + self.effective_n_threads, + ) + ) - # Do not recompute norms if datasets are identical. - self.X_norm_squared = ( - self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads) - ) self.use_squared_distances = use_squared_distances if use_squared_distances: From e60059ac5857afb13ca9a8c13e82ce3969183edd Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 11:22:32 +0200 Subject: [PATCH 08/21] Simplify Tempita preprocessing --- .../_argkmin.pxd.tp | 18 +---------- .../_argkmin.pyx.tp | 18 +---------- .../_base.pxd.tp | 31 +++++++------------ .../_base.pyx.tp | 17 +--------- .../_datasets_pair.pxd.tp | 6 ++-- .../_radius_neighbors.pxd.tp | 17 +--------- .../_radius_neighbors.pyx.tp | 17 +--------- 7 files changed, 19 insertions(+), 105 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp index bf304beb52891..b738cda119c11 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -1,25 +1,9 @@ -{{py: - -implementation_specific_values = [ - # Values are the following ones: - # - # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE - # - # We also use the float64 dtype and C-type names as defined in - # `sklearn.utils._typedefs` to maintain consistency. - # - ('64', 'cnp.float64_t', 'np.float64'), - ('32', 'cnp.float32_t', 'np.float32') -] - -}} - cimport numpy as cnp from ...utils._typedefs cimport ITYPE_t, DTYPE_t cnp.import_array() -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +{{for name_suffix in ['64', '32']}} from ._base cimport BaseDistancesReduction{{name_suffix}} from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index e11652ecfa9d6..7219525a985a8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -1,19 +1,3 @@ -{{py: - -implementation_specific_values = [ - # Values are the following ones: - # - # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE - # - # We also use the float64 dtype and C-type names as defined in - # `sklearn.utils._typedefs` to maintain consistency. - # - ('64', 'DTYPE_t', 'DTYPE'), - ('32', 'cnp.float32_t', 'np.float32') -] - -}} - cimport numpy as cnp from libc.stdlib cimport free, malloc @@ -37,7 +21,7 @@ from ...utils._typedefs import ITYPE, DTYPE cnp.import_array() -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +{{for name_suffix in ['64', '32']}} from ._base cimport ( BaseDistancesReduction{{name_suffix}}, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp index b5e4261c2217a..44f48f5bf1558 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -1,18 +1,3 @@ -{{py: - -implementation_specific_values = [ - # Values are the following ones: - # - # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE - # - # We also use the float64 dtype and C-type names as defined in - # `sklearn.utils._typedefs` to maintain consistency. - # - ('64', 'DTYPE_t', 'DTYPE'), - ('32', 'cnp.float32_t', 'np.float32') -] - -}} cimport numpy as cnp from cython cimport final @@ -21,15 +6,21 @@ from ...utils._typedefs cimport ITYPE_t, DTYPE_t cnp.import_array() -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} - -from ._datasets_pair cimport DatasetsPair{{name_suffix}} +cpdef DTYPE_t[::1] _sqeuclidean_row_norms64( + const DTYPE_t[:, ::1] X, + ITYPE_t num_threads, +) -cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}( - const {{INPUT_DTYPE_t}}[:, ::1] X, +cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( + const cnp.float32_t[:, ::1] X, ITYPE_t num_threads, ) +{{for name_suffix in ['64', '32']}} + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + + cdef class BaseDistancesReduction{{name_suffix}}: """ Base float{{name_suffix}} implementation template of the pairwise-distances diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index aeb28e8103b6a..53c383f1d62f9 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -1,18 +1,3 @@ -{{py: - -implementation_specific_values = [ - # Values are the following ones: - # - # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE - # - # We also use the float64 dtype and C-type names as defined in - # `sklearn.utils._typedefs` to maintain consistency. - # - ('64', 'DTYPE_t', 'DTYPE'), - ('32', 'cnp.float32_t', 'np.float32') -] - -}} cimport numpy as cnp cimport openmp @@ -101,7 +86,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( return squared_row_norms -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +{{for name_suffix in ['64', '32']}} from ._datasets_pair cimport DatasetsPair{{name_suffix}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index a02d505808a52..e220f730e7529 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -7,8 +7,8 @@ implementation_specific_values = [ # # We use DistanceMetric for float64 for backward naming compatibility. # - ('64', 'DistanceMetric', 'DTYPE_t', 'DTYPE'), - ('32', 'DistanceMetric32', 'cnp.float32_t', 'np.float32') + ('64', 'DistanceMetric', 'DTYPE_t'), + ('32', 'DistanceMetric32', 'cnp.float32_t') ] }} @@ -17,7 +17,7 @@ cimport numpy as cnp from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric32 -{{for name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +{{for name_suffix, DistanceMetric, INPUT_DTYPE_t in implementation_specific_values}} cdef class DatasetsPair{{name_suffix}}: diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp index 2d72c8d88a4c0..80fd2775b5acd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp @@ -1,18 +1,3 @@ -{{py: - -implementation_specific_values = [ - # Values are the following ones: - # - # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE - # - # We also use the float64 dtype and C-type names as defined in - # `sklearn.utils._typedefs` to maintain consistency. - # - ('64', 'DTYPE_t', 'DTYPE'), - ('32', 'cnp.float32_t', 'np.float32') -] - -}} cimport numpy as cnp from libcpp.memory cimport shared_ptr @@ -41,7 +26,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( ) ##################### -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +{{for name_suffix in ['64', '32']}} from ._base cimport BaseDistancesReduction{{name_suffix}} from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index d2177554299ea..887e5c4dffc76 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -1,18 +1,3 @@ -{{py: - -implementation_specific_values = [ - # Values are the following ones: - # - # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE - # - # We also use the float64 dtype and C-type names as defined in - # `sklearn.utils._typedefs` to maintain consistency. - # - ('64', 'DTYPE_t', 'DTYPE'), - ('32', 'cnp.float32_t', 'np.float32') -] - -}} cimport numpy as cnp import numpy as np import warnings @@ -56,7 +41,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( return nd_arrays_of_nd_arrays ##################### -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +{{for name_suffix in ['64', '32']}} from ._base cimport ( BaseDistancesReduction{{name_suffix}}, From 22338accd98955329088da3920f158b2e088f833 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 11:49:11 +0200 Subject: [PATCH 09/21] Add a negative zeros and NaNs guard for the Euclidean specialisation --- .../_argkmin.pyx.tp | 24 ++++++++------ .../_radius_neighbors.pyx.tp | 32 ++++++++++++------- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 7219525a985a8..bd598c11e7b3b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -486,7 +486,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ) nogil: cdef: ITYPE_t i, j - DTYPE_t squared_dist_i_j + DTYPE_t sqeucliean_dist_i_j ITYPE_t n_X = X_end - X_start ITYPE_t n_Y = Y_end - Y_start DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_dist_middle_terms( @@ -500,19 +500,23 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): # which keep tracks of the argkmin. for i in range(n_X): for j in range(n_Y): + sqeucliean_dist_i_j = ( + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * n_Y + j] + + self.Y_norm_squared[j + Y_start] + ) + + # Guard against eventual -0. and NaN caused by catastrophic + # cancellation (e.g. if X is Y) + sqeucliean_dist_i_j = ( + sqeucliean_dist_i_j if 0. < sqeucliean_dist_i_j == sqeucliean_dist_i_j else 0. + ) + heap_push( heaps_r_distances + i * self.k, heaps_indices + i * self.k, self.k, - # Using the squared euclidean distance as the rank-preserving distance: - # - # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - # - ( - self.X_norm_squared[i + X_start] + - dist_middle_terms[i * n_Y + j] + - self.Y_norm_squared[j + Y_start] - ), + sqeucliean_dist_i_j, j + Y_start, ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 887e5c4dffc76..f9bc8ec162144 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -492,27 +492,35 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ) nogil: cdef: ITYPE_t i, j - DTYPE_t squared_dist_i_j - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start + ITYPE_t pair_index = 0 + DTYPE_t sqeucliean_dist_i_j + DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_dist_middle_terms( X_start, X_end, Y_start, Y_end, thread_num ) # Pushing the distance and their associated indices in vectors. - for i in range(n_X): - for j in range(n_Y): + for i in range(X_start, X_end): + for j in range(Y_start, Y_end): # Using the squared euclidean distance as the rank-preserving distance: # # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² # - squared_dist_i_j = ( - self.X_norm_squared[i + X_start] - + dist_middle_terms[i * n_Y + j] - + self.Y_norm_squared[j + Y_start] + sqeucliean_dist_i_j = ( + self.X_norm_squared[i] + + dist_middle_terms[pair_index] + + self.Y_norm_squared[j] ) - if squared_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) - deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) + # Guard against eventual -0. and NaN caused by catastrophic + # cancellation (e.g. if X is Y) + sqeucliean_dist_i_j = ( + sqeucliean_dist_i_j if 0. < sqeucliean_dist_i_j == sqeucliean_dist_i_j else 0. + ) + + if sqeucliean_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i].push_back(sqeucliean_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) + + pair_index += 1 {{endfor}} From b802a058a07af274afba7c3e1a0deaf935343e22 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 11:52:20 +0200 Subject: [PATCH 10/21] Use keywords arguments for calls to heap_push --- .../_argkmin.pyx.tp | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index bd598c11e7b3b..1f84f8fd9042e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -162,11 +162,11 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): for i in range(n_samples_X): for j in range(n_samples_Y): heap_push( - heaps_r_distances + i * self.k, - heaps_indices + i * self.k, - self.k, - self.datasets_pair.surrogate_dist(X_start + i, Y_start + j), - Y_start + j, + values=heaps_r_distances + i * self.k, + indices=heaps_indices + i * self.k, + size=self.k, + val=self.datasets_pair.surrogate_dist(X_start + i, Y_start + j), + val_idx=Y_start + j, ) cdef void _parallel_on_X_init_chunk( @@ -255,11 +255,11 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): for thread_num in range(self.chunks_n_threads): for jdx in range(self.k): heap_push( - &self.argkmin_distances[X_start + idx, 0], - &self.argkmin_indices[X_start + idx, 0], - self.k, - self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], - self.heaps_indices_chunks[thread_num][idx * self.k + jdx], + values=&self.argkmin_distances[X_start + idx, 0], + indices=&self.argkmin_indices[X_start + idx, 0], + size=self.k, + val=self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], + val_idx=self.heaps_indices_chunks[thread_num][idx * self.k + jdx], ) cdef void _parallel_on_Y_finalize( @@ -513,11 +513,11 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ) heap_push( - heaps_r_distances + i * self.k, - heaps_indices + i * self.k, - self.k, - sqeucliean_dist_i_j, - j + Y_start, + values=heaps_r_distances + i * self.k, + indices=heaps_indices + i * self.k, + size=self.k, + val=sqeucliean_dist_i_j, + val_idx=j + Y_start, ) {{endfor}} From 13ced49e7a8efa39077bc8b5f8223fadc8e73883 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 14:26:10 +0200 Subject: [PATCH 11/21] Reword explanations for unsupported distance metrics --- .../_pairwise_distances_reduction/_dispatcher.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 56589b1174477..1ad4ae393a815 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -61,12 +61,15 @@ class BaseDistancesReductionDispatcher: @classmethod def valid_metrics(cls) -> List[str]: excluded = { - "pyfunc", # is relatively slow because we need to coerce data as np arrays + # PyFunc cannot be supported because it necessitates interacting with + # the CPython interpreter to call user defined functions. + "pyfunc", "mahalanobis", # is numerically unstable - # TODO: In order to support discrete distance metrics, we need to have a - # stable simultaneous sort which preserves the order of the input. - # The best might be using std::stable_sort and a Comparator taking an - # Arrays of Structures instead of Structure of Arrays (currently used). + # In order to support discrete distance metrics, we need to have a + # stable simultaneous sort which preserves the order of the indices + # because there generally is a lot of occurrences for a given values + # of distances in this case. + # TODO: implement a stable simultaneous_sort. "hamming", *BOOL_METRICS, } From 631af0db712e9a3207fe5d03525795929b36a8fa Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 14:44:15 +0200 Subject: [PATCH 12/21] Remove unused symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you, @MarcoGorelli for working on cython-lint! 🙏 --- .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 6 ++---- .../metrics/_pairwise_distances_reduction/_base.pyx.tp | 2 +- .../_gemm_term_computer.pyx.tp | 3 --- .../_radius_neighbors.pyx.tp | 8 ++------ 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 1f84f8fd9042e..abb4e6c067bb1 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -14,7 +14,7 @@ import warnings from numbers import Integral from scipy.sparse import issparse -from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration +from ...utils import check_scalar, _in_unstable_openblas_configuration from ...utils.fixes import threadpool_limits from ...utils._typedefs import ITYPE, DTYPE @@ -188,7 +188,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): ITYPE_t X_end, ) nogil: cdef: - ITYPE_t idx, jdx + ITYPE_t idx # Sorting the main heaps portion associated to `X[X_start:X_end]` # in ascending order w.r.t the distances. @@ -287,7 +287,6 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): cdef void compute_exact_distances(self) nogil: cdef: ITYPE_t i, j - ITYPE_t[:, ::1] Y_indices = self.argkmin_indices DTYPE_t[:, ::1] distances = self.argkmin_distances for i in prange(self.n_samples_X, schedule='static', nogil=True, num_threads=self.effective_n_threads): @@ -439,7 +438,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): cdef void _parallel_on_Y_init( self, ) nogil: - cdef ITYPE_t thread_num ArgKmin{{name_suffix}}._parallel_on_Y_init(self) self.gemm_term_computer._parallel_on_Y_init() diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index 53c383f1d62f9..e4c7fcadfbeba 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -16,7 +16,7 @@ from numbers import Integral from sklearn import get_config from sklearn.utils import check_scalar from ...utils._openmp_helpers import _openmp_effective_n_threads -from ...utils._typedefs import ITYPE, DTYPE +from ...utils._typedefs import DTYPE cnp.import_array() diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index fe7b4ae9f7ad8..e69d1c3df9f7d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -22,7 +22,6 @@ from ...utils._typedefs cimport DTYPE_t, ITYPE_t from ...utils._cython_blas cimport ( BLAS_Order, BLAS_Trans, - ColMajor, NoTrans, RowMajor, Trans, @@ -176,8 +175,6 @@ cdef class GEMMTermComputer{{name_suffix}}: ITYPE_t thread_num, ) nogil: cdef: - ITYPE_t i, j - DTYPE_t squared_dist_i_j const {{INPUT_DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :] const {{INPUT_DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :] DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index f9bc8ec162144..a2daf18a25d12 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -14,7 +14,7 @@ from ...utils._vector_sentinel cimport vector_to_nd_array from numbers import Real from scipy.sparse import issparse -from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration +from ...utils import check_scalar, _in_unstable_openblas_configuration from ...utils.fixes import threadpool_limits cnp.import_array() @@ -214,9 +214,6 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) ITYPE_t X_start, ITYPE_t X_end, ) nogil: - cdef: - ITYPE_t idx, jdx - # Sorting neighbors for each query vector of X if self.sort_results: for idx in range(X_start, X_end): @@ -276,7 +273,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) self, ) nogil: cdef: - ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current + ITYPE_t idx with nogil, parallel(num_threads=self.effective_n_threads): # Merge vectors used in threads into the main ones. @@ -443,7 +440,6 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} cdef void _parallel_on_Y_init( self, ) nogil: - cdef ITYPE_t thread_num RadiusNeighbors{{name_suffix}}._parallel_on_Y_init(self) self.gemm_term_computer._parallel_on_Y_init() From e6dcde837db9c05dc8dc697e9f7b248cff84fae4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 15:02:22 +0200 Subject: [PATCH 13/21] DOC Make dispatchers' docstring uniform --- .../_dispatcher.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 1ad4ae393a815..674e1db446d14 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -244,8 +244,6 @@ def compute( 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity for parallelism and is therefore more efficient - despite the synchronization step at each iteration of the outer loop - on chunks of `X`. - None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. @@ -268,9 +266,9 @@ def compute( Notes ----- - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`ArgKmin64`. + This public classmethod is responsible for introspecting the arguments + values to dispatch to the private dtype-specialized implementation of + :class:`ArgKmin`. This allows decoupling the API entirely from the implementation details whilst maintaining RAII: all temporarily allocated datastructures necessary @@ -415,13 +413,12 @@ def compute( ----- This public classmethod is responsible for introspecting the arguments values to dispatch to the private dtype-specialized implementation of - :class:`RadiusNeighbors64`. - - All temporarily allocated datastructures necessary for the concrete - implementation are therefore freed when this classmethod returns. + :class:`RadiusNeighbors`. - This allows entirely decoupling the API entirely from the - implementation details whilst maintaining RAII. + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. """ if X.dtype == Y.dtype == np.float64: return RadiusNeighbors64.compute( From 5485a4a5b382f5eb4363d17306280c7e88fee17d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 30 Sep 2022 12:39:00 +0200 Subject: [PATCH 14/21] Remove unneeded references to DenseDenseDatasetsPair instances --- .../_pairwise_distances_reduction/_argkmin.pyx.tp | 15 +++++---------- .../_radius_neighbors.pyx.tp | 15 +++++---------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index abb4e6c067bb1..e9da9cc5386c2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -349,21 +349,16 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): strategy=strategy, k=k, ) - # X and Y are checked by the DatasetsPair{{name_suffix}} implemented - # as a DenseDenseDatasetsPair{{name_suffix}} cdef: - DenseDenseDatasetsPair{{name_suffix}} datasets_pair = ( - self.datasets_pair - ) ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk self.gemm_term_computer = GEMMTermComputer{{name_suffix}}( - datasets_pair.X, - datasets_pair.Y, + X, + Y, self.effective_n_threads, self.chunks_n_threads, dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], + n_features=X.shape[1], chunk_size=self.chunk_size, ) @@ -371,7 +366,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") else: self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( - datasets_pair.Y, + Y, self.effective_n_threads, ) @@ -382,7 +377,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): self.X_norm_squared = ( self.Y_norm_squared if X is Y else _sqeuclidean_row_norms{{name_suffix}}( - datasets_pair.X, + X, self.effective_n_threads, ) ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index a2daf18a25d12..26a906b990624 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -354,21 +354,16 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} strategy=strategy, sort_results=sort_results, ) - # X and Y are checked by the DatasetsPair{{name_suffix}} implemented - # as a DenseDenseDatasetsPair{{name_suffix}} cdef: - DenseDenseDatasetsPair{{name_suffix}} datasets_pair = ( - self.datasets_pair - ) ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk self.gemm_term_computer = GEMMTermComputer{{name_suffix}}( - datasets_pair.X, - datasets_pair.Y, + X, + Y, self.effective_n_threads, self.chunks_n_threads, dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], + n_features=X.shape[1], chunk_size=self.chunk_size, ) @@ -376,7 +371,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") else: self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( - datasets_pair.Y, + Y, self.effective_n_threads, ) @@ -387,7 +382,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} self.X_norm_squared = ( self.Y_norm_squared if X is Y else _sqeuclidean_row_norms{{name_suffix}}( - datasets_pair.X, + X, self.effective_n_threads, ) ) From 192dd92eb2280c66f081a249d61b2822fa96f0a5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 7 Oct 2022 08:34:28 +0200 Subject: [PATCH 15/21] Apply review comments Co-authored-by: Olivier Grisel --- .../_argkmin.pyx.tp | 38 ++++++++++++------ .../_radius_neighbors.pyx.tp | 40 +++++++++++++------ 2 files changed, 53 insertions(+), 25 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index e9da9cc5386c2..d07ccab7129f5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -14,7 +14,7 @@ import warnings from numbers import Integral from scipy.sparse import issparse -from ...utils import check_scalar, _in_unstable_openblas_configuration +from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration from ...utils.fixes import threadpool_limits from ...utils._typedefs import ITYPE, DTYPE @@ -362,16 +362,26 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): chunk_size=self.chunk_size, ) - if metric_kwargs is not None and metric_kwargs.get("Y_norm_squared", None) is not None: - self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = check_array( + metric_kwargs.pop("Y_norm_squared"), + ensure_2d=False, + input_name="Y_norm_squared", + dtype=np.float64, + ) else: self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( Y, self.effective_n_threads, ) - if metric_kwargs is not None and metric_kwargs.get("X_norm_squared", None) is not None: - self.X_norm_squared = metric_kwargs.pop("X_norm_squared") + if metric_kwargs is not None and "X_norm_squared" in metric_kwargs: + self.X_norm_squared = check_array( + metric_kwargs.pop("X_norm_squared"), + ensure_2d=False, + input_name="X_norm_squared", + dtype=np.float64, + ) else: # Do not recompute norms if datasets are identical. self.X_norm_squared = ( @@ -479,7 +489,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ) nogil: cdef: ITYPE_t i, j - DTYPE_t sqeucliean_dist_i_j + DTYPE_t sqeuclidean_dist_i_j ITYPE_t n_X = X_end - X_start ITYPE_t n_Y = Y_end - Y_start DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_dist_middle_terms( @@ -493,23 +503,27 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): # which keep tracks of the argkmin. for i in range(n_X): for j in range(n_Y): - sqeucliean_dist_i_j = ( + sqeuclidean_dist_i_j = ( self.X_norm_squared[i + X_start] + dist_middle_terms[i * n_Y + j] + self.Y_norm_squared[j + Y_start] ) - # Guard against eventual -0. and NaN caused by catastrophic - # cancellation (e.g. if X is Y) - sqeucliean_dist_i_j = ( - sqeucliean_dist_i_j if 0. < sqeucliean_dist_i_j == sqeucliean_dist_i_j else 0. + sqeuclidean_dist_i_j = ( + sqeuclidean_dist_i_j + # Catastrophic cancellation causing -0. and NaN can be present + # in some situations, e.g. when computing d(x_i, y_i) when X is Y. + # + # Guard against -0. Guard against NaN + # v v + if 0. < sqeuclidean_dist_i_j == sqeuclidean_dist_i_j else 0. ) heap_push( values=heaps_r_distances + i * self.k, indices=heaps_indices + i * self.k, size=self.k, - val=sqeucliean_dist_i_j, + val=sqeuclidean_dist_i_j, val_idx=j + Y_start, ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 26a906b990624..53f971451a748 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -14,7 +14,7 @@ from ...utils._vector_sentinel cimport vector_to_nd_array from numbers import Real from scipy.sparse import issparse -from ...utils import check_scalar, _in_unstable_openblas_configuration +from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration from ...utils.fixes import threadpool_limits cnp.import_array() @@ -367,16 +367,26 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} chunk_size=self.chunk_size, ) - if metric_kwargs is not None and metric_kwargs.get("Y_norm_squared", None) is not None: - self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = check_array( + metric_kwargs.pop("Y_norm_squared"), + ensure_2d=False, + input_name="Y_norm_squared", + dtype=np.float64, + ) else: self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( Y, self.effective_n_threads, ) - if metric_kwargs is not None and metric_kwargs.get("X_norm_squared", None) is not None: - self.X_norm_squared = metric_kwargs.pop("X_norm_squared") + if metric_kwargs is not None and "X_norm_squared" in metric_kwargs: + self.X_norm_squared = check_array( + metric_kwargs.pop("X_norm_squared"), + ensure_2d=False, + input_name="X_norm_squared", + dtype=np.float64, + ) else: # Do not recompute norms if datasets are identical. self.X_norm_squared = ( @@ -484,7 +494,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} cdef: ITYPE_t i, j ITYPE_t pair_index = 0 - DTYPE_t sqeucliean_dist_i_j + DTYPE_t sqeuclidean_dist_i_j DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_dist_middle_terms( X_start, X_end, Y_start, Y_end, thread_num @@ -497,20 +507,24 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} # # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² # - sqeucliean_dist_i_j = ( + sqeuclidean_dist_i_j = ( self.X_norm_squared[i] + dist_middle_terms[pair_index] + self.Y_norm_squared[j] ) - # Guard against eventual -0. and NaN caused by catastrophic - # cancellation (e.g. if X is Y) - sqeucliean_dist_i_j = ( - sqeucliean_dist_i_j if 0. < sqeucliean_dist_i_j == sqeucliean_dist_i_j else 0. + sqeuclidean_dist_i_j = ( + sqeuclidean_dist_i_j + # Catastrophic cancellation causing -0. and NaN can be present + # in some situations, e.g. when computing d(x_i, y_i) when X is Y. + # + # Guard against -0. Guard against NaN + # v v + if 0. < sqeuclidean_dist_i_j == sqeuclidean_dist_i_j else 0. ) - if sqeucliean_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i].push_back(sqeucliean_dist_i_j) + if sqeuclidean_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i].push_back(sqeuclidean_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) pair_index += 1 From b29c79dae6ce93322ce94839016ff184eda8220e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 12 Oct 2022 14:46:35 +0200 Subject: [PATCH 16/21] Remove NaN guard and adapt -0. guard Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. Fan --- .../_pairwise_distances_reduction/_argkmin.pyx.tp | 12 +++--------- .../_radius_neighbors.pyx.tp | 12 +++--------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index d07ccab7129f5..f92b6027d0a74 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -509,15 +509,9 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): self.Y_norm_squared[j + Y_start] ) - sqeuclidean_dist_i_j = ( - sqeuclidean_dist_i_j - # Catastrophic cancellation causing -0. and NaN can be present - # in some situations, e.g. when computing d(x_i, y_i) when X is Y. - # - # Guard against -0. Guard against NaN - # v v - if 0. < sqeuclidean_dist_i_j == sqeuclidean_dist_i_j else 0. - ) + # Catastrophic cancellation might cause -0. to be present, + # e.g. when computing d(x_i, y_i) when X is Y. + sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j) heap_push( values=heaps_r_distances + i * self.k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 53f971451a748..f953429966260 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -513,15 +513,9 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} + self.Y_norm_squared[j] ) - sqeuclidean_dist_i_j = ( - sqeuclidean_dist_i_j - # Catastrophic cancellation causing -0. and NaN can be present - # in some situations, e.g. when computing d(x_i, y_i) when X is Y. - # - # Guard against -0. Guard against NaN - # v v - if 0. < sqeuclidean_dist_i_j == sqeuclidean_dist_i_j else 0. - ) + # Catastrophic cancellation might cause -0. to be present, + # e.g. when computing d(x_i, y_i) when X is Y. + sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j) if sqeuclidean_dist_i_j <= self.r_radius: deref(self.neigh_distances_chunks[thread_num])[i].push_back(sqeuclidean_dist_i_j) From fa0e2eab59d9a9f0a035175466bee2bcbcb5b10c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 14 Oct 2022 17:36:52 +0200 Subject: [PATCH 17/21] fixup! Merge branch 'main' into maint/pdr-misc --- sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index b7cf2f6ec4f73..cd693f352dbc5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -266,9 +266,8 @@ def compute( Notes ----- - This public classmethod is responsible for introspecting the arguments - values to dispatch to the private dtype-specialized implementation of - :class:`ArgKmin`. + This classmethod inspects the arguments values to dispatch to the + dtype-specialized implementation of :class:`ArgKmin`. This allows decoupling the API entirely from the implementation details whilst maintaining RAII: all temporarily allocated datastructures necessary From 3c2126d497226bf43ca4a104aa8943e85edf66cc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 17 Oct 2022 15:53:23 +0200 Subject: [PATCH 18/21] Revert to explicit indexing Also remove the comment not to duplicate and restate information at several place in those implementations. Co-authored-by: Thomas J. Fan --- .../_radius_neighbors.pyx.tp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 6cbe233ebf748..c2cf0256a457f 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -496,23 +496,24 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ) nogil: cdef: ITYPE_t i, j - ITYPE_t pair_index = 0 DTYPE_t sqeuclidean_dist_i_j + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_dist_middle_terms( X_start, X_end, Y_start, Y_end, thread_num ) # Pushing the distance and their associated indices in vectors. - for i in range(X_start, X_end): - for j in range(Y_start, Y_end): + for i in range(n_X): + for j in range(n_Y): # Using the squared euclidean distance as the rank-preserving distance: # # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² # sqeuclidean_dist_i_j = ( - self.X_norm_squared[i] - + dist_middle_terms[pair_index] - + self.Y_norm_squared[j] + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * n_Y + j] + + self.Y_norm_squared[j + Y_start] ) # Catastrophic cancellation might cause -0. to be present, @@ -520,9 +521,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j) if sqeuclidean_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i].push_back(sqeuclidean_dist_i_j) - deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) - - pair_index += 1 + deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(sqeuclidean_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) {{endfor}} From 8ddef0104101b79af82071c039be455611a6c92f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 17 Oct 2022 16:45:25 +0200 Subject: [PATCH 19/21] fixup! Revert to explicit indexing --- .../_pairwise_distances_reduction/_radius_neighbors.pyx.tp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index c2cf0256a457f..15c49ef067dea 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -506,10 +506,6 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} # Pushing the distance and their associated indices in vectors. for i in range(n_X): for j in range(n_Y): - # Using the squared euclidean distance as the rank-preserving distance: - # - # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - # sqeuclidean_dist_i_j = ( self.X_norm_squared[i + X_start] + dist_middle_terms[i * n_Y + j] From f2e917bbdac1e53a62be73ee28caad0c88b266ec Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 17 Oct 2022 16:45:44 +0200 Subject: [PATCH 20/21] Do not slice memoryviews in _compute_dist_middle_terms See the reasons here: https://github.com/scikit-learn/scikit-learn/issues/17299 --- .../_gemm_term_computer.pyx.tp | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index e69d1c3df9f7d..e040417cbe705 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -175,8 +175,6 @@ cdef class GEMMTermComputer{{name_suffix}}: ITYPE_t thread_num, ) nogil: cdef: - const {{INPUT_DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :] - const {{INPUT_DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :] DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() # Careful: LDA, LDB and LDC are given for F-ordered arrays @@ -187,9 +185,9 @@ cdef class GEMMTermComputer{{name_suffix}}: BLAS_Order order = RowMajor BLAS_Trans ta = NoTrans BLAS_Trans tb = Trans - ITYPE_t m = X_c.shape[0] - ITYPE_t n = Y_c.shape[0] - ITYPE_t K = X_c.shape[1] + ITYPE_t m = X_end - X_start + ITYPE_t n = Y_end - Y_start + ITYPE_t K = self.n_features DTYPE_t alpha = - 2. {{if upcast_to_float64}} DTYPE_t * A = self.X_c_upcast[thread_num].data() @@ -198,15 +196,15 @@ cdef class GEMMTermComputer{{name_suffix}}: # Casting for A and B to remove the const is needed because APIs exposed via # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. # See: https://github.com/scipy/scipy/issues/14262 - DTYPE_t * A = &X_c[0, 0] - DTYPE_t * B = &Y_c[0, 0] + DTYPE_t * A = &self.X[X_start, 0] + DTYPE_t * B = &self.Y[Y_start, 0] {{endif}} - ITYPE_t lda = X_c.shape[1] - ITYPE_t ldb = X_c.shape[1] + ITYPE_t lda = self.n_features + ITYPE_t ldb = self.n_features DTYPE_t beta = 0. - ITYPE_t ldc = Y_c.shape[0] + ITYPE_t ldc = Y_end - Y_start - # dist_middle_terms = `-2 * X_c @ Y_c.T` + # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T` _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) return dist_middle_terms From 379ffae4ed0016ba6815bcfd46e16a3b3bd6aa2b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 21 Oct 2022 11:19:48 +0200 Subject: [PATCH 21/21] Revert "Do not slice memoryviews in _compute_dist_middle_terms" This reverts commit f2e917bbdac1e53a62be73ee28caad0c88b266ec. --- .../_gemm_term_computer.pyx.tp | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index e040417cbe705..e69d1c3df9f7d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -175,6 +175,8 @@ cdef class GEMMTermComputer{{name_suffix}}: ITYPE_t thread_num, ) nogil: cdef: + const {{INPUT_DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :] + const {{INPUT_DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :] DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() # Careful: LDA, LDB and LDC are given for F-ordered arrays @@ -185,9 +187,9 @@ cdef class GEMMTermComputer{{name_suffix}}: BLAS_Order order = RowMajor BLAS_Trans ta = NoTrans BLAS_Trans tb = Trans - ITYPE_t m = X_end - X_start - ITYPE_t n = Y_end - Y_start - ITYPE_t K = self.n_features + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] DTYPE_t alpha = - 2. {{if upcast_to_float64}} DTYPE_t * A = self.X_c_upcast[thread_num].data() @@ -196,15 +198,15 @@ cdef class GEMMTermComputer{{name_suffix}}: # Casting for A and B to remove the const is needed because APIs exposed via # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. # See: https://github.com/scipy/scipy/issues/14262 - DTYPE_t * A = &self.X[X_start, 0] - DTYPE_t * B = &self.Y[Y_start, 0] + DTYPE_t * A = &X_c[0, 0] + DTYPE_t * B = &Y_c[0, 0] {{endif}} - ITYPE_t lda = self.n_features - ITYPE_t ldb = self.n_features + ITYPE_t lda = X_c.shape[1] + ITYPE_t ldb = X_c.shape[1] DTYPE_t beta = 0. - ITYPE_t ldc = Y_end - Y_start + ITYPE_t ldc = Y_c.shape[0] - # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T` + # dist_middle_terms = `-2 * X_c @ Y_c.T` _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) return dist_middle_terms