Skip to content

[MRG+1] ENH use pairwise_distances_chunked in brute nearest neighbors #11136

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v0.20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,10 @@ Classifiers and regressors
- :func:`manifold.t_sne.trustworthiness` accepts metrics other than
Euclidean. :issue:`9775` by :user:`William de Vazelhes <wdevazelhes>`.

- :mod:`Nearest neighbors <neighbors>` query methods are now more memory
efficient when ``algorithm='brute'``. :issue:`11136` by `Joel Nothman`_
and :user:`Aman Dalmia <dalmia>`.

Cluster

- :class:`cluster.KMeans`, :class:`cluster.MiniBatchKMeans` and
Expand Down
162 changes: 111 additions & 51 deletions sklearn/neighbors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
#
# License: BSD 3 clause (C) INRIA, University of Amsterdam
from functools import partial

import warnings
from abc import ABCMeta, abstractmethod

Expand All @@ -15,7 +17,7 @@
from .ball_tree import BallTree
from .kd_tree import KDTree
from ..base import BaseEstimator
from ..metrics import pairwise_distances
from ..metrics import pairwise_distances_chunked
from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices
from ..utils.multiclass import check_classification_targets
Expand Down Expand Up @@ -276,9 +278,43 @@ def _pairwise(self):
class KNeighborsMixin(object):
"""Mixin for k-neighbors searches"""

def _kneighbors_reduce_func(self, dist, start,
n_neighbors, return_distance):
Copy link
Member

@rth rth May 27, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to decorate it with @staticmethod and remove self to help readability?

Copy link
Member Author

@jnothman jnothman May 28, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It uses self.effective_metric_

"""Reduce a chunk of distances to the nearest neighbors

Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`

Parameters
----------
dist : array of shape (n_samples_chunk, n_samples)
start : int
The index in X which the first row of dist corresponds to.
n_neighbors : int
return_distance : bool

Returns
-------
dist : array of shape (n_samples_chunk, n_neighbors), optional
Returned only if return_distance
neigh : array of shape (n_samples_chunk, n_neighbors)
"""
sample_range = np.arange(dist.shape[0])[:, None]
neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)
neigh_ind = neigh_ind[:, :n_neighbors]
# argpartition doesn't guarantee sorted order, so we sort again
neigh_ind = neigh_ind[
sample_range, np.argsort(dist[sample_range, neigh_ind])]
if return_distance:
if self.effective_metric_ == 'euclidean':
result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
else:
result = dist[sample_range, neigh_ind], neigh_ind
else:
result = neigh_ind
return result

def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
"""Finds the K-neighbors of a point.

Returns indices of and distances to the neighbors of each point.

Parameters
Expand Down Expand Up @@ -367,28 +403,19 @@ class from an array representing our data set and ask who's

n_jobs = _get_n_jobs(self.n_jobs)
if self._fit_method == 'brute':
# for efficiency, use squared euclidean distances
if self.effective_metric_ == 'euclidean':
dist = pairwise_distances(X, self._fit_X, 'euclidean',
n_jobs=n_jobs, squared=True)
else:
dist = pairwise_distances(
X, self._fit_X, self.effective_metric_, n_jobs=n_jobs,
**self.effective_metric_params_)

neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)
neigh_ind = neigh_ind[:, :n_neighbors]
# argpartition doesn't guarantee sorted order, so we sort again
neigh_ind = neigh_ind[
sample_range, np.argsort(dist[sample_range, neigh_ind])]
reduce_func = partial(self._kneighbors_reduce_func,
n_neighbors=n_neighbors,
return_distance=return_distance)

if return_distance:
if self.effective_metric_ == 'euclidean':
result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
else:
result = dist[sample_range, neigh_ind], neigh_ind
else:
result = neigh_ind
# for efficiency, use squared euclidean distances
kwds = ({'squared': True} if self.effective_metric_ == 'euclidean'
else self.effective_metric_params_)

result = pairwise_distances_chunked(
X, self._fit_X, reduce_func=reduce_func,
metric=self.effective_metric_, n_jobs=n_jobs,
**kwds)

elif self._fit_method in ['ball_tree', 'kd_tree']:
if issparse(X):
Expand All @@ -400,14 +427,15 @@ class from an array representing our data set and ask who's
X[s], n_neighbors, return_distance)
for s in gen_even_slices(X.shape[0], n_jobs)
)
if return_distance:
dist, neigh_ind = tuple(zip(*result))
result = np.vstack(dist), np.vstack(neigh_ind)
else:
result = np.vstack(result)
else:
raise ValueError("internal: _fit_method not recognized")

if return_distance:
dist, neigh_ind = zip(*result)
result = np.vstack(dist), np.vstack(neigh_ind)
else:
result = np.vstack(result)

if not query_is_train:
return result
else:
Expand Down Expand Up @@ -519,6 +547,40 @@ def kneighbors_graph(self, X=None, n_neighbors=None,
class RadiusNeighborsMixin(object):
"""Mixin for radius-based neighbors searches"""

def _radius_neighbors_reduce_func(self, dist, start,
radius, return_distance):
"""Reduce a chunk of distances to the nearest neighbors

Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`

Parameters
----------
dist : array of shape (n_samples_chunk, n_samples)
start : int
The index in X which the first row of dist corresponds to.
radius : float
return_distance : bool

Returns
-------
dist : list of n_samples_chunk 1d arrays, optional
Returned only if return_distance
neigh : list of n_samples_chunk 1d arrays
"""
neigh_ind = [np.where(d <= radius)[0] for d in dist]

if return_distance:
if self.effective_metric_ == 'euclidean':
dist = [np.sqrt(d[neigh_ind[i]])
for i, d in enumerate(dist)]
else:
dist = [d[neigh_ind[i]]
for i, d in enumerate(dist)]
results = dist, neigh_ind
else:
results = neigh_ind
return results

def radius_neighbors(self, X=None, radius=None, return_distance=True):
"""Finds the neighbors within a given radius of a point or points.

Expand Down Expand Up @@ -597,39 +659,37 @@ class from an array representing our data set and ask who's
if radius is None:
radius = self.radius

n_samples = X.shape[0]
if self._fit_method == 'brute':
# for efficiency, use squared euclidean distances
if self.effective_metric_ == 'euclidean':
dist = pairwise_distances(X, self._fit_X, 'euclidean',
n_jobs=self.n_jobs, squared=True)
radius *= radius
kwds = {'squared': True}
else:
dist = pairwise_distances(X, self._fit_X,
self.effective_metric_,
n_jobs=self.n_jobs,
**self.effective_metric_params_)

neigh_ind_list = [np.where(d <= radius)[0] for d in dist]
kwds = self.effective_metric_params_

# See https://github.com/numpy/numpy/issues/5456
# if you want to understand why this is initialized this way.
neigh_ind = np.empty(n_samples, dtype='object')
neigh_ind[:] = neigh_ind_list
reduce_func = partial(self._radius_neighbors_reduce_func,
radius=radius,
return_distance=return_distance)

results = pairwise_distances_chunked(
X, self._fit_X, reduce_func=reduce_func,
metric=self.effective_metric_, n_jobs=self.n_jobs,
**kwds)
if return_distance:
dist_array = np.empty(n_samples, dtype='object')
if self.effective_metric_ == 'euclidean':
dist_list = [np.sqrt(d[neigh_ind[i]])
for i, d in enumerate(dist)]
else:
dist_list = [d[neigh_ind[i]]
for i, d in enumerate(dist)]
dist_array[:] = dist_list

results = dist_array, neigh_ind
dist_chunks, neigh_ind_chunks = zip(*results)
dist_list = sum(dist_chunks, [])
neigh_ind_list = sum(neigh_ind_chunks, [])
# See https://github.com/numpy/numpy/issues/5456
# if you want to understand why this is initialized this way.
dist = np.empty(len(dist_list), dtype='object')
dist[:] = dist_list
neigh_ind = np.empty(len(neigh_ind_list), dtype='object')
neigh_ind[:] = neigh_ind_list
results = dist, neigh_ind
else:
results = neigh_ind
neigh_ind_list = sum(results, [])
results = np.empty(len(neigh_ind_list), dtype='object')
results[:] = neigh_ind_list

elif self._fit_method in ['ball_tree', 'kd_tree']:
if issparse(X):
Expand Down