scikit-learn · OmarManzoor · Apr 3, 2023 · Apr 20, 2023 · May 2, 2023 · May 10, 2023
diff --git a/.gitignore b/.gitignore
@@ -99,6 +99,7 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
 sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
+sklearn/metrics/_pairwise_distances_reduction/_argkmin_lrd.pyx
 
 # Default JupyterLite content
 jupyterlite_contents
diff --git a/setup.cfg b/setup.cfg
@@ -90,6 +90,7 @@ ignore =
     sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
+    sklearn/metrics/_pairwise_distances_reduction/_argkmin_lrd.pyx
 
 
 [codespell]

diff --git a/setup.py b/setup.py
@@ -291,6 +291,12 @@ def check_package_status(package, min_version):
             "include_np": True,
             "extra_compile_args": ["-std=c++11"],
         },
+        {
+            "sources": ["_argkmin_lrd.pyx.tp"],
+            "language": "c++",
+            "include_np": True,
+            "extra_compile_args": ["-std=c++11"],
+        },
     ],
     "preprocessing": [
         {"sources": ["_csr_polynomial_expansion.pyx"]},

diff --git a/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/sklearn/metrics/_pairwise_distances_reduction/__init__.py
@@ -92,6 +92,7 @@
     RadiusNeighbors,
     ArgKminClassMode,
     sqeuclidean_row_norms,
+    ArgKminLRD,
 )
 
 __all__ = [
@@ -100,4 +101,5 @@
     "RadiusNeighbors",
     "ArgKminClassMode",
     "sqeuclidean_row_norms",
+    "ArgKminLRD",
 ]
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_lrd.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_lrd.pyx.tp
@@ -0,0 +1,140 @@
+from cython cimport floating, integral
+from cython.parallel cimport prange
+from libc.math cimport fmax
+from libc.stdlib cimport free
+
+from ...utils._typedefs cimport intp_t, float64_t
+from ...utils.fixes import threadpool_limits
+
+import numpy as np
+
+
+cdef float64_t EPS = 1e-10
+
+{{for name_suffix in ["32", "64"]}}
+
+from ._argkmin cimport ArgKmin{{name_suffix}}
+from ._datasets_pair cimport DatasetsPair{{name_suffix}}
+
+cdef class ArgKminLRD{{name_suffix}}(ArgKmin{{name_suffix}}):
+    """
+    {{name_suffix}}bit implementation of ArgKminLRD. Current
+    implementation supports the fit method of LocalOutlierFactor.
+    """
+    cdef:
+        float64_t[::1] lrd
+        float64_t[:, ::1] actual_distances
+        intp_t[:, ::1] actual_indices
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        intp_t k,
+        str metric="euclidean",
+        chunk_size=None,
+        dict metric_kwargs=None,
+        str strategy=None,
+        bint return_distance=False,
+    ):
+        # Use a generic implementation that handles most scipy
+        # metrics by computing the distances between 2 vectors at a time.
+        pda = ArgKminLRD{{name_suffix}}(
+            datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
+            k=k,
+            chunk_size=chunk_size,
+            strategy=strategy,
+        )
+
+        # Limit the number of threads in second level of nested parallelism for BLAS
+        # to avoid threads over-subscription (in GEMM for instance).
+        with threadpool_limits(limits=1, user_api="blas"):
+            if pda.execute_in_parallel_on_Y:
+                pda._parallel_on_Y()
+            else:
+                pda._parallel_on_X()
+
+        return pda._finalize_results(return_distance)
+
+    def __init__(
+        self,
+        DatasetsPair{{name_suffix}} datasets_pair,
+        chunk_size=None,
+        strategy=None,
+        intp_t k=1,
+    ):
+        super().__init__(
+            datasets_pair=datasets_pair,
+            chunk_size=chunk_size,
+            strategy=strategy,
+            k=k,
+        )
+
+        self.lrd = np.zeros(self.n_samples_X, dtype=np.float64)
+
+        # Current implementation of ArgKminLRD is used in the fit method
+        # of the LocalOutlierFactor. When it is called from fit the
+        # number of neighbors are actually set to one more than the actual
+        # number of neighbors. There we would actually get k + 1. However
+        # the first neighbor is actually ignored when doing the computation.
+        # This is why the number of neighbors below is set to k - 1.
+        self.actual_distances = np.zeros(
+            (self.n_samples_X, self.k - 1), dtype=np.float64
+        )
+        self.actual_indices = np.zeros(
+            (self.n_samples_X, self.k - 1), dtype=np.intp
+        )
+
+    cdef inline void _compute_local_reachability_density(
+        self,
+        bint return_distance,
+    ) noexcept nogil:
+        cdef:
+            intp_t sample_index, neighbor_rank, neighbor_idx
+            float64_t dist_sample, dist_neighbor, dist_reach
+            float64_t overall_reachability
+
+        for sample_index in prange(
+            self.n_samples_X,
+            schedule='static',
+            nogil=True,
+            num_threads=self.effective_n_threads,
+        ):
+            overall_reachability = 0
+            # Since the first neighbor is ignored when calling ArgKminLRD from
+            # the fit method of the LocalOutlierFactor we simply start the loop
+            # from 1.
+            for neighbor_rank in range(1, self.k):
+                neighbor_idx = self.argkmin_indices[sample_index, neighbor_rank]
+                dist_sample = self.datasets_pair.distance_metric._rdist_to_dist(
+                    fmax(self.argkmin_distances[sample_index, neighbor_rank], 0.0)
+                )
+                dist_neighbor = self.datasets_pair.distance_metric._rdist_to_dist(
+                    fmax(self.argkmin_distances[neighbor_idx, self.k - 1], 0.0)
+                )
+                dist_reach = fmax(dist_sample, dist_neighbor)
+                overall_reachability += dist_reach
+                # For actual indices and actual distances we need to fill the
+                # arrays starting from neighbor index = 0 thus we subtract 1
+                # here.
+                self.actual_indices[sample_index, neighbor_rank - 1] = neighbor_idx
+                if return_distance:
+                    self.actual_distances[sample_index, neighbor_rank - 1] = dist_sample
+
+            # self.k - 1 is the actual number of neighbors ignoring the
+            # first extra one.
+            overall_reachability = overall_reachability / (self.k - 1)
+            self.lrd[sample_index] = 1.0 / (overall_reachability + EPS)
+
+    def _finalize_results(self, bint return_distance):
+        self._compute_local_reachability_density(return_distance=return_distance)
+        if return_distance:
+            return (
+                np.asarray(self.lrd),
+                np.asarray(self.actual_indices),
+                np.asarray(self.actual_distances),
+            )
+        return np.asarray(self.lrd), np.asarray(self.actual_indices)
+
+{{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -24,6 +24,11 @@
     RadiusNeighbors32,
 )
 
+from ._argkmin_lrd import (
+    ArgKminLRD64,
+    ArgKminLRD32,
+)
+
 from ... import get_config
 
 
@@ -618,3 +623,131 @@ def compute(
             "Only float64 or float32 datasets pairs are supported at this time, "
             f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
         )
+
+
+class ArgKminLRD(BaseDistancesReductionDispatcher):
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        k,
+        metric="euclidean",
+        chunk_size=None,
+        metric_kwargs=None,
+        strategy=None,
+        return_distance=False,
+    ):
+        """Compute the argkmin reduction for calculating the local reachability
+        density.
+
+        ArgKminLRD is typically used to perform bruteforce k-nearest neighbors
+        queries and compute the local reachability density using the distances
+        and indices of the final neighbors that are computed.
+
+        This class is not meant to be instantiated, one should only use
+        its :meth:`compute` classmethod which handles allocation and
+        deallocation consistently.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        k : int
+            The number of nearest neighbors to consider.
+
+        metric : str, default='euclidean'
+            The distance metric to use. For a list of available metrics, see
+            the documentation of :class:`~sklearn.metrics.DistanceMetric`.
+            Currently does not support `'precomputed'`.
+
+        chunk_size : int, default=None,
+            The number of vectors per chunk. If None (default) looks-up in
+            scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and use 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset parallelization are made on.
+
+            For both strategies the computations happens with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            Strategies differs on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+                Each thread then iterates on all the chunks of Y. This strategy is
+                embarrassingly parallel and comes with no datastructures
+                synchronisation.
+
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+                Each thread processes all the chunks of X in turn. This strategy is
+                a sequence of embarrassingly parallel subtasks (the inner loop on Y
+                chunks) with intermediate datastructures synchronisation at each
+                iteration of the sequential outer loop on X chunks.
+
+              - 'auto' relies on a simple heuristic to choose between
+                'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+                'parallel_on_X' is usually the most efficient strategy.
+                When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
+                brings more opportunity for parallelism and is therefore more efficient
+                despite the synchronization step at each iteration of the outer loop
+                on chunks of `X`.
+
+              - None (default) looks-up in scikit-learn configuration for
+                `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+
+        return_distance : boolean, default=False
+            Return distances between each X vector and its
+            argkmin if set to True.
+
+        Returns
+        -------
+        lrd : ndarray of shape (n_samples_X,)
+            An array containing the local reachability density for each sample.
+
+        Notes
+        -----
+        This classmethod is responsible for introspecting the arguments
+        values to dispatch to the most appropriate implementation of
+        :class:`PairwiseDistancesArgKmin`.
+
+        This allows decoupling the API entirely from the implementation details
+        whilst maintaining RAII: all temporarily allocated datastructures necessary
+        for the concrete implementation are therefore freed when this classmethod
+        returns.
+        """
+        if X.dtype == Y.dtype == np.float64:
+            return ArgKminLRD64.compute(
+                X=X,
+                Y=Y,
+                k=k,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                return_distance=return_distance,
+            )
+
+        if X.dtype == Y.dtype == np.float32:
+            return ArgKminLRD32.compute(
+                X=X,
+                Y=Y,
+                k=k,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                return_distance=return_distance,
+            )
+
+        raise ValueError(
+            "Only float64 or float32 datasets pairs are supported at this time, "
+            f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -14,6 +14,7 @@
     BaseDistancesReductionDispatcher,
     ArgKmin,
     ArgKminClassMode,
+    ArgKminLRD,
     RadiusNeighbors,
     sqeuclidean_row_norms,
 )
@@ -770,6 +771,38 @@ def test_argkmin_classmode_factory_method_wrong_usages():
     # of ArgKminClassMode is supported.
 
 
+def test_argkmin_lrd_factory_method_wrong_usages():
+    rng = np.random.RandomState(1)
+    X = rng.rand(100, 10)
+    Y = rng.rand(100, 10)
+    k = 5
+    metric = "manhattan"
+
+    msg = (
+        "Only float64 or float32 datasets pairs are supported at this time, "
+        "got: X.dtype=float32 and Y.dtype=float64"
+    )
+    with pytest.raises(ValueError, match=msg):
+        ArgKminLRD.compute(
+            X=X.astype(np.float32),
+            Y=Y,
+            k=k,
+            metric=metric,
+        )
+
+    msg = (
+        "Only float64 or float32 datasets pairs are supported at this time, "
+        "got: X.dtype=float64 and Y.dtype=int32"
+    )
+    with pytest.raises(ValueError, match=msg):
+        ArgKminLRD.compute(
+            X=X,
+            Y=Y.astype(np.int32),
+            k=k,
+            metric=metric,
+        )
+
+
 def test_radius_neighbors_factory_method_wrong_usages():
     rng = np.random.RandomState(1)
     X = rng.rand(100, 10)