Skip to content

ENH add pairwise distances backend for LocalOutlierFactor #26316

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
sklearn/metrics/_pairwise_distances_reduction/_argkmin_lrd.pyx

# Default JupyterLite content
jupyterlite_contents
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ ignore =
sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
sklearn/metrics/_pairwise_distances_reduction/_argkmin_lrd.pyx


[codespell]
Expand Down
6 changes: 6 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,12 @@ def check_package_status(package, min_version):
"include_np": True,
"extra_compile_args": ["-std=c++11"],
},
{
"sources": ["_argkmin_lrd.pyx.tp"],
"language": "c++",
"include_np": True,
"extra_compile_args": ["-std=c++11"],
},
],
"preprocessing": [
{"sources": ["_csr_polynomial_expansion.pyx"]},
Expand Down
2 changes: 2 additions & 0 deletions sklearn/metrics/_pairwise_distances_reduction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@
RadiusNeighbors,
ArgKminClassMode,
sqeuclidean_row_norms,
ArgKminLRD,
)

__all__ = [
Expand All @@ -100,4 +101,5 @@
"RadiusNeighbors",
"ArgKminClassMode",
"sqeuclidean_row_norms",
"ArgKminLRD",
]
140 changes: 140 additions & 0 deletions sklearn/metrics/_pairwise_distances_reduction/_argkmin_lrd.pyx.tp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
from cython cimport floating, integral
from cython.parallel cimport prange
from libc.math cimport fmax
from libc.stdlib cimport free

from ...utils._typedefs cimport intp_t, float64_t
from ...utils.fixes import threadpool_limits

import numpy as np


cdef float64_t EPS = 1e-10

{{for name_suffix in ["32", "64"]}}

from ._argkmin cimport ArgKmin{{name_suffix}}
from ._datasets_pair cimport DatasetsPair{{name_suffix}}

cdef class ArgKminLRD{{name_suffix}}(ArgKmin{{name_suffix}}):
"""
{{name_suffix}}bit implementation of ArgKminLRD. Current
implementation supports the fit method of LocalOutlierFactor.
"""
cdef:
float64_t[::1] lrd
float64_t[:, ::1] actual_distances
intp_t[:, ::1] actual_indices

@classmethod
def compute(
cls,
X,
Y,
intp_t k,
str metric="euclidean",
chunk_size=None,
dict metric_kwargs=None,
str strategy=None,
bint return_distance=False,
):
# Use a generic implementation that handles most scipy
# metrics by computing the distances between 2 vectors at a time.
pda = ArgKminLRD{{name_suffix}}(
datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
k=k,
chunk_size=chunk_size,
strategy=strategy,
)

# Limit the number of threads in second level of nested parallelism for BLAS
# to avoid threads over-subscription (in GEMM for instance).
with threadpool_limits(limits=1, user_api="blas"):
if pda.execute_in_parallel_on_Y:
pda._parallel_on_Y()
else:
pda._parallel_on_X()

return pda._finalize_results(return_distance)

def __init__(
self,
DatasetsPair{{name_suffix}} datasets_pair,
chunk_size=None,
strategy=None,
intp_t k=1,
):
super().__init__(
datasets_pair=datasets_pair,
chunk_size=chunk_size,
strategy=strategy,
k=k,
)

self.lrd = np.zeros(self.n_samples_X, dtype=np.float64)

# Current implementation of ArgKminLRD is used in the fit method
# of the LocalOutlierFactor. When it is called from fit the
# number of neighbors are actually set to one more than the actual
# number of neighbors. There we would actually get k + 1. However
# the first neighbor is actually ignored when doing the computation.
# This is why the number of neighbors below is set to k - 1.
self.actual_distances = np.zeros(
(self.n_samples_X, self.k - 1), dtype=np.float64
)
self.actual_indices = np.zeros(
(self.n_samples_X, self.k - 1), dtype=np.intp
)

cdef inline void _compute_local_reachability_density(
self,
bint return_distance,
) noexcept nogil:
cdef:
intp_t sample_index, neighbor_rank, neighbor_idx
float64_t dist_sample, dist_neighbor, dist_reach
float64_t overall_reachability

for sample_index in prange(
self.n_samples_X,
schedule='static',
nogil=True,
num_threads=self.effective_n_threads,
):
overall_reachability = 0
# Since the first neighbor is ignored when calling ArgKminLRD from
# the fit method of the LocalOutlierFactor we simply start the loop
# from 1.
for neighbor_rank in range(1, self.k):
neighbor_idx = self.argkmin_indices[sample_index, neighbor_rank]
dist_sample = self.datasets_pair.distance_metric._rdist_to_dist(
fmax(self.argkmin_distances[sample_index, neighbor_rank], 0.0)
)
dist_neighbor = self.datasets_pair.distance_metric._rdist_to_dist(
fmax(self.argkmin_distances[neighbor_idx, self.k - 1], 0.0)
)
dist_reach = fmax(dist_sample, dist_neighbor)
overall_reachability += dist_reach
# For actual indices and actual distances we need to fill the
# arrays starting from neighbor index = 0 thus we subtract 1
# here.
self.actual_indices[sample_index, neighbor_rank - 1] = neighbor_idx
if return_distance:
self.actual_distances[sample_index, neighbor_rank - 1] = dist_sample

# self.k - 1 is the actual number of neighbors ignoring the
# first extra one.
overall_reachability = overall_reachability / (self.k - 1)
self.lrd[sample_index] = 1.0 / (overall_reachability + EPS)

def _finalize_results(self, bint return_distance):
self._compute_local_reachability_density(return_distance=return_distance)
if return_distance:
return (
np.asarray(self.lrd),
np.asarray(self.actual_indices),
np.asarray(self.actual_distances),
)
return np.asarray(self.lrd), np.asarray(self.actual_indices)

{{endfor}}
133 changes: 133 additions & 0 deletions sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@
RadiusNeighbors32,
)

from ._argkmin_lrd import (
ArgKminLRD64,
ArgKminLRD32,
)

from ... import get_config


Expand Down Expand Up @@ -618,3 +623,131 @@ def compute(
"Only float64 or float32 datasets pairs are supported at this time, "
f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
)


class ArgKminLRD(BaseDistancesReductionDispatcher):
@classmethod
def compute(
cls,
X,
Y,
k,
metric="euclidean",
chunk_size=None,
metric_kwargs=None,
strategy=None,
return_distance=False,
):
"""Compute the argkmin reduction for calculating the local reachability
density.

ArgKminLRD is typically used to perform bruteforce k-nearest neighbors
queries and compute the local reachability density using the distances
and indices of the final neighbors that are computed.

This class is not meant to be instantiated, one should only use
its :meth:`compute` classmethod which handles allocation and
deallocation consistently.

Parameters
----------
X : ndarray or CSR matrix of shape (n_samples_X, n_features)
Input data.

Y : ndarray or CSR matrix of shape (n_samples_X, n_features)
Input data.

k : int
The number of nearest neighbors to consider.

metric : str, default='euclidean'
The distance metric to use. For a list of available metrics, see
the documentation of :class:`~sklearn.metrics.DistanceMetric`.
Currently does not support `'precomputed'`.

chunk_size : int, default=None,
The number of vectors per chunk. If None (default) looks-up in
scikit-learn configuration for `pairwise_dist_chunk_size`,
and use 256 if it is not set.

metric_kwargs : dict, default=None
Keyword arguments to pass to specified metric function.

strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
The chunking strategy defining which dataset parallelization are made on.

For both strategies the computations happens with two nested loops,
respectively on chunks of X and chunks of Y.
Strategies differs on which loop (outer or inner) is made to run
in parallel with the Cython `prange` construct:

- 'parallel_on_X' dispatches chunks of X uniformly on threads.
Each thread then iterates on all the chunks of Y. This strategy is
embarrassingly parallel and comes with no datastructures
synchronisation.

- 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
Each thread processes all the chunks of X in turn. This strategy is
a sequence of embarrassingly parallel subtasks (the inner loop on Y
chunks) with intermediate datastructures synchronisation at each
iteration of the sequential outer loop on X chunks.

- 'auto' relies on a simple heuristic to choose between
'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
'parallel_on_X' is usually the most efficient strategy.
When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
brings more opportunity for parallelism and is therefore more efficient
despite the synchronization step at each iteration of the outer loop
on chunks of `X`.

- None (default) looks-up in scikit-learn configuration for
`pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.

return_distance : boolean, default=False
Return distances between each X vector and its
argkmin if set to True.

Returns
-------
lrd : ndarray of shape (n_samples_X,)
An array containing the local reachability density for each sample.

Notes
-----
This classmethod is responsible for introspecting the arguments
values to dispatch to the most appropriate implementation of
:class:`PairwiseDistancesArgKmin`.

This allows decoupling the API entirely from the implementation details
whilst maintaining RAII: all temporarily allocated datastructures necessary
for the concrete implementation are therefore freed when this classmethod
returns.
"""
if X.dtype == Y.dtype == np.float64:
return ArgKminLRD64.compute(
X=X,
Y=Y,
k=k,
metric=metric,
chunk_size=chunk_size,
metric_kwargs=metric_kwargs,
strategy=strategy,
return_distance=return_distance,
)

if X.dtype == Y.dtype == np.float32:
return ArgKminLRD32.compute(
X=X,
Y=Y,
k=k,
metric=metric,
chunk_size=chunk_size,
metric_kwargs=metric_kwargs,
strategy=strategy,
return_distance=return_distance,
)

raise ValueError(
"Only float64 or float32 datasets pairs are supported at this time, "
f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
)
Comment on lines +750 to +753
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to add a test for this

33 changes: 33 additions & 0 deletions sklearn/metrics/tests/test_pairwise_distances_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
BaseDistancesReductionDispatcher,
ArgKmin,
ArgKminClassMode,
ArgKminLRD,
RadiusNeighbors,
sqeuclidean_row_norms,
)
Expand Down Expand Up @@ -770,6 +771,38 @@ def test_argkmin_classmode_factory_method_wrong_usages():
# of ArgKminClassMode is supported.


def test_argkmin_lrd_factory_method_wrong_usages():
rng = np.random.RandomState(1)
X = rng.rand(100, 10)
Y = rng.rand(100, 10)
k = 5
metric = "manhattan"

msg = (
"Only float64 or float32 datasets pairs are supported at this time, "
"got: X.dtype=float32 and Y.dtype=float64"
)
with pytest.raises(ValueError, match=msg):
ArgKminLRD.compute(
X=X.astype(np.float32),
Y=Y,
k=k,
metric=metric,
)

msg = (
"Only float64 or float32 datasets pairs are supported at this time, "
"got: X.dtype=float64 and Y.dtype=int32"
)
with pytest.raises(ValueError, match=msg):
ArgKminLRD.compute(
X=X,
Y=Y.astype(np.int32),
k=k,
metric=metric,
)


def test_radius_neighbors_factory_method_wrong_usages():
rng = np.random.RandomState(1)
X = rng.rand(100, 10)
Expand Down
Loading