scikit-learn · jjerphan · Jun 22, 2022 · Jul 2, 2022 · Jul 4, 2022 · Jul 11, 2022
diff --git a/.gitignore b/.gitignore
@@ -95,5 +95,7 @@ sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd
 sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
 sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
 sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
+sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd
+sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
diff --git a/setup.cfg b/setup.cfg
@@ -85,6 +85,8 @@ ignore =
     sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
     sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
     sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
+    sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd
+    sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
 

diff --git a/setup.py b/setup.py
@@ -90,6 +90,7 @@
     "sklearn.metrics._pairwise_distances_reduction._datasets_pair",
     "sklearn.metrics._pairwise_distances_reduction._middle_term_computer",
     "sklearn.metrics._pairwise_distances_reduction._base",
+    "sklearn.metrics._pairwise_distances_reduction._pairwise_distances",
     "sklearn.metrics._pairwise_distances_reduction._argkmin",
     "sklearn.metrics._pairwise_distances_reduction._radius_neighbors",
     "sklearn.metrics._pairwise_fast",
@@ -327,6 +328,12 @@ def check_package_status(package, min_version):
             "include_np": True,
             "extra_compile_args": ["-std=c++11"],
         },
+        {
+            "sources": ["_pairwise_distances.pyx.tp", "_pairwise_distances.pxd.tp"],
+            "language": "c++",
+            "include_np": True,
+            "extra_compile_args": ["-std=c++11"],
+        },
         {
             "sources": ["_argkmin.pyx.tp", "_argkmin.pxd.tp"],
             "language": "c++",

diff --git a/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/sklearn/metrics/_pairwise_distances_reduction/__init__.py
@@ -89,13 +89,18 @@
 from ._dispatcher import (
     BaseDistancesReductionDispatcher,
     ArgKmin,
+    PairwiseDistances,
     RadiusNeighbors,
     sqeuclidean_row_norms,
 )
 
+from ._pairwise_distances import _precompute_metric_params
+
 __all__ = [
     "BaseDistancesReductionDispatcher",
     "ArgKmin",
+    "PairwiseDistances",
     "RadiusNeighbors",
     "sqeuclidean_row_norms",
+    "_precompute_metric_params",
 ]
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
@@ -50,6 +50,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk
         ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk
 
+        bint X_is_Y
         bint execute_in_parallel_on_Y
 
     @final

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp
@@ -25,6 +25,8 @@ cdef class DatasetsPair{{name_suffix}}:
         {{DistanceMetric}} distance_metric
         ITYPE_t n_features
 
+        readonly bint X_is_Y
+
     cdef ITYPE_t n_samples_X(self) nogil
 
     cdef ITYPE_t n_samples_Y(self) nogil

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp
@@ -1,3 +1,5 @@
+import copy
+
 {{py:
 
 implementation_specific_values = [
@@ -91,18 +93,24 @@ cdef class DatasetsPair{{name_suffix}}:
         datasets_pair: DatasetsPair{{name_suffix}}
             The suited DatasetsPair{{name_suffix}} implementation.
         """
-        # Y_norm_squared might be propagated down to DatasetsPairs
-        # via metrics_kwargs when the Euclidean specialisations
-        # can't be used. To prevent Y_norm_squared to be passed
+        # X_norm_squared and Y_norm_squared might be propagated
+        # down to DatasetsPairs via metrics_kwargs when the Euclidean
+        # specialisations can't be used.
+        # To prevent X_norm_squared and Y_norm_squared from being passed
         # down to DistanceMetrics (whose constructors would raise
-        # a RuntimeError), we pop it here.
+        # a RuntimeError), we pop them here.
         if metric_kwargs is not None:
+            # Copy metric_kwargs so that popping "X_norm_squared" and
+            # "Y_norm_squared" below does not mutate the caller's dict.
+            metric_kwargs = copy.copy(metric_kwargs)
+            metric_kwargs.pop("X_norm_squared", None)
             metric_kwargs.pop("Y_norm_squared", None)
         cdef:
             {{DistanceMetric}} distance_metric = {{DistanceMetric}}.get_metric(
                 metric,
                 **(metric_kwargs or {})
             )
+            bint X_is_Y = X is Y
 
         # Metric-specific checks that do not replace nor duplicate `check_array`.
         distance_metric._validate_data(X)
@@ -112,15 +120,15 @@ cdef class DatasetsPair{{name_suffix}}:
         Y_is_sparse = issparse(Y)
 
         if not X_is_sparse and not Y_is_sparse:
-            return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
+            return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y)
 
         if X_is_sparse and Y_is_sparse:
-            return SparseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
+            return SparseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y)
 
         if X_is_sparse and not Y_is_sparse:
-            return SparseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
+            return SparseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y)
 
-        return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
+        return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y)
 
     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
@@ -130,8 +138,9 @@ cdef class DatasetsPair{{name_suffix}}:
         X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
         return X_data, X_indices, X_indptr
 
-    def __init__(self, {{DistanceMetric}} distance_metric, ITYPE_t n_features):
+    def __init__(self, {{DistanceMetric}} distance_metric, ITYPE_t n_features, bint X_is_Y):
         self.distance_metric = distance_metric
+        self.X_is_Y = X_is_Y
         self.n_features = n_features
 
     cdef ITYPE_t n_samples_X(self) nogil:
@@ -179,8 +188,9 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         const {{INPUT_DTYPE_t}}[:, ::1] X,
         const {{INPUT_DTYPE_t}}[:, ::1] Y,
         {{DistanceMetric}} distance_metric,
+        bint X_is_Y,
     ):
-        super().__init__(distance_metric, n_features=X.shape[1])
+        super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y)
         # Arrays have already been checked
         self.X = X
         self.Y = Y
@@ -219,8 +229,8 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         between two vectors of (X, Y).
     """
 
-    def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
-        super().__init__(distance_metric, n_features=X.shape[1])
+    def __init__(self, X, Y, {{DistanceMetric}} distance_metric, bint X_is_Y):
+        super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y)
 
         self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
         self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
@@ -279,8 +289,8 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         between two vectors of (X, Y).
     """
 
-    def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
-        super().__init__(distance_metric, n_features=X.shape[1])
+    def __init__(self, X, Y, {{DistanceMetric}} distance_metric, bint X_is_Y):
+        super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y)
 
         self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
 
@@ -377,10 +387,10 @@ cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         between two vectors of (X, Y).
     """
 
-    def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
-        super().__init__(distance_metric, n_features=X.shape[1])
+    def __init__(self, X, Y, {{DistanceMetric}} distance_metric, bint X_is_Y):
+        super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y)
         # Swapping arguments on the constructor
-        self.datasets_pair = SparseDenseDatasetsPair{{name_suffix}}(Y, X, distance_metric)
+        self.datasets_pair = SparseDenseDatasetsPair{{name_suffix}}(Y, X, distance_metric, X_is_Y)
 
     @final
     cdef ITYPE_t n_samples_X(self) nogil:

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -16,6 +16,12 @@
     ArgKmin64,
     ArgKmin32,
 )
+
+from ._pairwise_distances import (
+    PairwiseDistances64,
+    PairwiseDistances32,
+)
+
 from ._radius_neighbors import (
     RadiusNeighbors64,
     RadiusNeighbors32,
@@ -168,6 +174,132 @@ def compute(
         """
 
 
+class PairwiseDistances(BaseDistancesReductionDispatcher):
+    """Compute the pairwise distances matrix for two sets of vectors.
+
+    The distance function `dist` depends on the values of the `metric`
+    and `metric_kwargs` parameters.
+
+    This class only computes the pairwise distances matrix without
+    applying any reduction on it. It shares most of the underlying
+    code infrastructure with reducing variants to leverage
+    cache-aware chunking and multi-thread parallelism.
+
+    This class is not meant to be instantiated, one should only use
+    its :meth:`compute` classmethod which handles allocation and
+    deallocation consistently.
+    """
+
+    @classmethod
+    def is_usable_for(cls, X, Y, metric) -> bool:
+        Y = X if Y is None else Y
+        return super().is_usable_for(X, Y, metric)
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        metric="euclidean",
+        chunk_size=None,
+        metric_kwargs=None,
+        strategy=None,
+    ):
+        """Return pairwise distances matrix for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None
+            The number of vectors per chunk. If None (default) looks-up in
+            scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and use 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset parallelization is made on.
+
+            For both strategies the computation happens with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            Strategies differ on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+                Each thread then iterates on all the chunks of Y. This strategy is
+                embarrassingly parallel and comes with no datastructures
+                synchronisation.
+
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+                Each thread processes all the chunks of X in turn. This strategy is
+                a sequence of embarrassingly parallel subtasks (the inner loop on Y
+                chunks) with intermediate datastructures synchronisation at each
+                iteration of the sequential outer loop on X chunks.
+
+              - 'auto' relies on a simple heuristic to choose between
+                'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+                'parallel_on_X' is usually the most efficient strategy.
+                When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
+                brings more opportunity for parallelism and is therefore more efficient.
+
+              - None (default) looks-up in scikit-learn configuration for
+                `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+
+        Returns
+        -------
+        pairwise_distances_matrix : ndarray of shape (n_samples_X, n_samples_Y)
+            The pairwise distances matrix.
+
+        Notes
+        -----
+        This public classmethod is responsible for introspecting the arguments
+        values to dispatch to the private dtype-specialized implementation of
+        :class:`PairwiseDistances`.
+
+        All temporarily allocated datastructures necessary for the concrete
+        implementation are therefore freed when this classmethod returns.
+
+        This allows decoupling the API entirely from the
+        implementation details whilst maintaining RAII.
+        """
+        Y = X if Y is None else Y
+        if X.dtype == Y.dtype == np.float64:
+            return PairwiseDistances64.compute(
+                X=X,
+                Y=Y,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+            )
+
+        if X.dtype == Y.dtype == np.float32:
+            return PairwiseDistances32.compute(
+                X=X,
+                Y=Y,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+            )
+
+        raise ValueError(
+            "Only float64 or float32 datasets pairs are supported, but "
+            f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
+
+
 class ArgKmin(BaseDistancesReductionDispatcher):
     """Compute the argkmin of row vectors of X on the ones of Y.
 
@@ -243,7 +375,7 @@ def compute(
                 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
                 'parallel_on_X' is usually the most efficient strategy.
                 When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
-                brings more opportunity for parallelism and is therefore more efficient
+                brings more opportunity for parallelism and is therefore more efficient.
 
               - None (default) looks-up in scikit-learn configuration for
                 `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
@@ -382,9 +514,7 @@ def compute(
                 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
                 'parallel_on_X' is usually the most efficient strategy.
                 When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
-                brings more opportunity for parallelism and is therefore more efficient
-                despite the synchronization step at each iteration of the outer loop
-                on chunks of `X`.
+                brings more opportunity for parallelism and is therefore more efficient.
 
               - None (default) looks-up in scikit-learn configuration for
                 `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp
@@ -0,0 +1,39 @@
+{{py:
+
+implementation_specific_values = [
+    # Values are the following ones:
+    #
+    #       name_suffix, INPUT_DTYPE_t
+    #
+    #
+    ('64', 'cnp.float64_t'),
+    ('32', 'cnp.float32_t')
+]
+
+}}
+cimport numpy as cnp
+
+from ...utils._typedefs cimport DTYPE_t
+{{for name_suffix, INPUT_DTYPE_t in implementation_specific_values}}
+
+from ._base cimport BaseDistancesReduction{{name_suffix}}
+from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
+
+
+cdef class PairwiseDistances{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
+    """float{{name_suffix}} implementation of PairwiseDistances."""
+
+    cdef:
+        {{INPUT_DTYPE_t}}[:, ::1] pairwise_distances_matrix
+
+
+cdef class EuclideanPairwiseDistances{{name_suffix}}(PairwiseDistances{{name_suffix}}):
+    """EuclideanDistance-specialized float{{name_suffix}} implementation for PairwiseDistances."""
+    cdef:
+        MiddleTermComputer{{name_suffix}} middle_term_computer
+        const DTYPE_t[::1] X_norm_squared
+        const DTYPE_t[::1] Y_norm_squared
+
+        bint use_squared_distances
+
+{{endfor}}