scikit-learn · Vincent-Maladiere · Feb 7, 2023 · Feb 7, 2023 · Feb 8, 2023 · Feb 11, 2023
diff --git a/.gitignore b/.gitignore
@@ -97,6 +97,8 @@ sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd
 sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
 sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
 sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
+sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd
+sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
 

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
@@ -518,6 +518,16 @@ Changelog
 - |Fix| :func:`metrics.manhattan_distances` now supports readonly sparse datasets.
   :pr:`25432` by :user:`Julien Jerphanion <jjerphan>`.
 
+- |Efficiency| The performance of :func:`metrics.pairwise_distances` has been
+  improved when providing dense datasets.
+  :pr:`25561` by :user:`Vincent Maladiere <Vincent-Maladiere>` and
+  :user:`Julien Jerphanion <jjerphan>`.
+
+- |Feature| :func:`metrics.pairwise_distances` now supports combinations of
+  dense arrays and sparse CSR matrices as datasets.
+  :pr:`25561` by :user:`Vincent Maladiere <Vincent-Maladiere>` and
+  :user:`Julien Jerphanion <jjerphan>`.
+
 - |Fix| Fixed :func:`metrics.classification_report` so that empty input will return
   `np.nan`. Previously, "macro avg" and `weighted avg` would return
   e.g. `f1-score=np.nan` and `f1-score=0.0`, being inconsistent. Now, they

diff --git a/setup.cfg b/setup.cfg
@@ -52,6 +52,8 @@ ignore =
     sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
     sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
     sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
+    sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd
+    sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
 

diff --git a/setup.py b/setup.py
@@ -277,6 +277,12 @@ def check_package_status(package, min_version):
             "include_np": True,
             "extra_compile_args": ["-std=c++11"],
         },
+        {
+            "sources": ["_pairwise_distances.pyx.tp", "_pairwise_distances.pxd.tp"],
+            "language": "c++",
+            "include_np": True,
+            "extra_compile_args": ["-std=c++11"],
+        },
         {
             "sources": ["_argkmin.pyx.tp", "_argkmin.pxd.tp"],
             "language": "c++",

diff --git a/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/sklearn/metrics/_pairwise_distances_reduction/__init__.py
@@ -90,14 +90,18 @@
     ArgKmin,
     ArgKminClassMode,
     BaseDistancesReductionDispatcher,
+    PairwiseDistances,
     RadiusNeighbors,
     sqeuclidean_row_norms,
 )
+from ._pairwise_distances import _precompute_metric_params
 
 __all__ = [
     "BaseDistancesReductionDispatcher",
     "ArgKmin",
+    "PairwiseDistances",
     "RadiusNeighbors",
     "ArgKminClassMode",
     "sqeuclidean_row_norms",
+    "_precompute_metric_params",
 ]
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
@@ -46,6 +46,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         intp_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk
         intp_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk
 
+        bint X_is_Y
         bint execute_in_parallel_on_Y
 
     @final

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp
@@ -20,6 +20,8 @@ cdef class DatasetsPair{{name_suffix}}:
         {{DistanceMetric}} distance_metric
         intp_t n_features
 
+        readonly bint X_is_Y
+
     cdef intp_t n_samples_X(self) noexcept nogil
 
     cdef intp_t n_samples_Y(self) noexcept nogil

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp
@@ -1,3 +1,5 @@
+import copy
+
 {{py:
 
 implementation_specific_values = [
@@ -84,19 +86,25 @@ cdef class DatasetsPair{{name_suffix}}:
         datasets_pair: DatasetsPair{{name_suffix}}
             The suited DatasetsPair{{name_suffix}} implementation.
         """
-        # Y_norm_squared might be propagated down to DatasetsPairs
-        # via metrics_kwargs when the Euclidean specialisations
-        # can't be used. To prevent Y_norm_squared to be passed
+        # X_norm_squared and Y_norm_squared might be propagated
+        # down to DatasetsPairs via metrics_kwargs when the Euclidean
+        # specialisations can't be used.
+        # To prevent X_norm_squared and Y_norm_squared from being passed
         # down to DistanceMetrics (whose constructors would raise
-        # a RuntimeError), we pop it here.
+        # a RuntimeError), we pop them here.
         if metric_kwargs is not None:
+            # Work on a shallow copy so that popping "X_norm_squared" and
+            # "Y_norm_squared" does not mutate the caller's dict.
+            metric_kwargs = copy.copy(metric_kwargs)
+            metric_kwargs.pop("X_norm_squared", None)
             metric_kwargs.pop("Y_norm_squared", None)
         cdef:
             {{DistanceMetric}} distance_metric = DistanceMetric.get_metric(
                 metric,
                 {{INPUT_DTYPE}},
                 **(metric_kwargs or {})
             )
+            bint X_is_Y = X is Y
 
         # Metric-specific checks that do not replace nor duplicate `check_array`.
         distance_metric._validate_data(X)
@@ -106,15 +114,15 @@ cdef class DatasetsPair{{name_suffix}}:
         Y_is_sparse = issparse(Y)
 
         if not X_is_sparse and not Y_is_sparse:
-            return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
+            return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y)
 
         if X_is_sparse and Y_is_sparse:
-            return SparseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
+            return SparseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y)
 
         if X_is_sparse and not Y_is_sparse:
-            return SparseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
+            return SparseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y)
 
-        return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
+        return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y)
 
     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
@@ -124,8 +132,9 @@ cdef class DatasetsPair{{name_suffix}}:
         X_indptr = np.asarray(X.indptr, dtype=np.int32)
         return X_data, X_indices, X_indptr
 
-    def __init__(self, {{DistanceMetric}} distance_metric, intp_t n_features):
+    def __init__(self, {{DistanceMetric}} distance_metric, intp_t n_features, bint X_is_Y):
         self.distance_metric = distance_metric
+        self.X_is_Y = X_is_Y
         self.n_features = n_features
 
     cdef intp_t n_samples_X(self) noexcept nogil:
@@ -173,8 +182,9 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         const {{INPUT_DTYPE_t}}[:, ::1] X,
         const {{INPUT_DTYPE_t}}[:, ::1] Y,
         {{DistanceMetric}} distance_metric,
+        bint X_is_Y,
     ):
-        super().__init__(distance_metric, n_features=X.shape[1])
+        super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y)
         # Arrays have already been checked
         self.X = X
         self.Y = Y
@@ -213,8 +223,8 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         between two vectors of (X, Y).
     """
 
-    def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
-        super().__init__(distance_metric, n_features=X.shape[1])
+    def __init__(self, X, Y, {{DistanceMetric}} distance_metric, bint X_is_Y):
+        super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y)
 
         self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
         self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
@@ -273,8 +283,8 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         between two vectors of (X, Y).
     """
 
-    def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
-        super().__init__(distance_metric, n_features=X.shape[1])
+    def __init__(self, X, Y, {{DistanceMetric}} distance_metric, bint X_is_Y):
+        super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y)
 
         self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
 
@@ -371,10 +381,10 @@ cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         between two vectors of (X, Y).
     """
 
-    def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
-        super().__init__(distance_metric, n_features=X.shape[1])
+    def __init__(self, X, Y, {{DistanceMetric}} distance_metric, bint X_is_Y):
+        super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y)
         # Swapping arguments on the constructor
-        self.datasets_pair = SparseDenseDatasetsPair{{name_suffix}}(Y, X, distance_metric)
+        self.datasets_pair = SparseDenseDatasetsPair{{name_suffix}}(Y, X, distance_metric, X_is_Y)
 
     @final
     cdef intp_t n_samples_X(self) noexcept nogil:

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -5,6 +5,7 @@
 from scipy.sparse import issparse, isspmatrix_csr
 
 from ... import get_config
+from ...utils._openmp_helpers import _openmp_effective_n_threads
 from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING64
 from ._argkmin import (
     ArgKmin32,
@@ -15,6 +16,10 @@
     ArgKminClassMode64,
 )
 from ._base import _sqeuclidean_row_norms32, _sqeuclidean_row_norms64
+from ._pairwise_distances import (
+    PairwiseDistances32,
+    PairwiseDistances64,
+)
 from ._radius_neighbors import (
     RadiusNeighbors32,
     RadiusNeighbors64,
@@ -148,6 +153,172 @@ def compute(
         """
 
 
+class PairwiseDistances(BaseDistancesReductionDispatcher):
+    """Compute the pairwise distances matrix for two sets of vectors.
+
+    The distance function `dist` depends on the values of the `metric`
+    and `metric_kwargs` parameters.
+
+    This class only computes the pairwise distances matrix without
+    applying any reduction on it. It shares most of the underlying
+    code infrastructure with reducing variants to leverage multi-thread
+    parallelism. However, contrary to the reducing variants, no chunking
+    is applied to allow for contiguous write access to the final distance
+    array that is not expected to fit in the CPU cache in general.
+
+    This class is not meant to be instantiated, one should only use
+    its :meth:`compute` classmethod which handles allocation and
+    deallocation consistently.
+    """
+
+    @classmethod
+    def is_usable_for(cls, X, Y, metric, metric_kwargs=None) -> bool:
+        """Return True if the dispatcher can be used for the
+        given parameters.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) or None
+            Input data. If None, `X` is used in its place.
+
+        metric : str
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        Returns
+        -------
+        True if the dispatcher can be used, else False.
+        """
+        effective_n_threads = _openmp_effective_n_threads()
+
+        def is_euclidean(metric, metric_kwargs):
+            metric_kwargs = metric_kwargs or dict()
+            euclidean_metrics = [
+                "euclidean",
+                "sqeuclidean",
+                "l2",
+            ]
+            # TODO: pass `p` as a standalone argument instead of a metric_kwargs.
+            return metric in euclidean_metrics or (
+                metric == "minkowski" and metric_kwargs.get("p", 2) == 2
+            )
+
+        Y = X if Y is None else Y
+
+        # We need to rely on `PairwiseDistances` for manhattan anyway because
+        # the implementation of manhattan distances on sparse data has been removed.
+        manhattan_metrics = ["cityblock", "l1", "manhattan"]
+
+        is_usable = super().is_usable_for(X, Y, metric) and (
+            (not is_euclidean(metric, metric_kwargs) and effective_n_threads != 1)
+            or metric in manhattan_metrics
+        )
+
+        return is_usable
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        metric="euclidean",
+        metric_kwargs=None,
+        strategy=None,
+    ):
+        """Return pairwise distances matrix for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) or None
+            Input data. If None, `X` is used in its place.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The strategy defining which dataset the parallelization is made on.
+
+            For both strategies, the computation happens with two nested loops,
+            respectively on rows of X and rows of Y.
+            The strategies differ on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+              - 'parallel_on_X' dispatches rows of X uniformly on threads.
+                Each thread then iterates on all the rows of Y. This strategy is
+                embarrassingly parallel and comes with no datastructures
+                synchronisation.
+
+              - 'parallel_on_Y' dispatches rows of Y uniformly on threads.
+                Each thread processes all the rows of X in turn. This strategy is
+                a sequence of embarrassingly parallel subtasks (the inner loop on Y
+                chunks) with no intermediate datastructures synchronisation.
+
+              - 'auto' relies on a simple heuristic to choose between
+                'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+                'parallel_on_X' is usually the most efficient strategy.
+                When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
+                brings more opportunity for parallelism and is therefore more efficient.
+
+              - None (default) looks up `pairwise_dist_parallel_strategy` in the
+                scikit-learn configuration, and uses 'auto' if it is not set.
+
+        Returns
+        -------
+        pairwise_distances_matrix : ndarray of shape (n_samples_X, n_samples_Y)
+            The pairwise distances matrix.
+
+        Notes
+        -----
+        This public classmethod is responsible for introspecting the arguments
+        values to dispatch to the private dtype-specialized implementation of
+        :class:`PairwiseDistances`.
+
+        All temporarily allocated datastructures necessary for the concrete
+        implementations are therefore freed when this classmethod returns.
+
+        This allows decoupling the API entirely from the
+        implementation details whilst maintaining RAII.
+        """
+        Y = X if Y is None else Y
+        if X.dtype == Y.dtype == np.float64:
+            return PairwiseDistances64.compute(
+                X=X,
+                Y=Y,
+                metric=metric,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+            )
+
+        if X.dtype == Y.dtype == np.float32:
+            return PairwiseDistances32.compute(
+                X=X,
+                Y=Y,
+                metric=metric,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+            )
+
+        raise ValueError(
+            "Only float64 or float32 datasets pairs are supported, but "
+            f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
+
+
 class ArgKmin(BaseDistancesReductionDispatcher):
     """Compute the argkmin of row vectors of X on the ones of Y.