Skip to content

[MRG+1] ENH use pairwise_distances_chunked in brute nearest neighbors #11136

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v0.20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,10 @@ Classifiers and regressors
- :func:`manifold.t_sne.trustworthiness` accepts metrics other than
Euclidean. :issue:`9775` by :user:`William de Vazelhes <wdevazelhes>`.

- :mod:`Nearest neighbors <neighbors>` query methods are now more memory
efficient when ``algorithm='brute'``. :issue:`11136` by `Joel Nothman`_
and :user:`Aman Dalmia <dalmia>`.

Cluster

- :class:`cluster.KMeans`, :class:`cluster.MiniBatchKMeans` and
Expand Down
162 changes: 111 additions & 51 deletions sklearn/neighbors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
#
# License: BSD 3 clause (C) INRIA, University of Amsterdam
from functools import partial

import warnings
from abc import ABCMeta, abstractmethod

Expand All @@ -15,7 +17,7 @@
from .ball_tree import BallTree
from .kd_tree import KDTree
from ..base import BaseEstimator
from ..metrics import pairwise_distances
from ..metrics import pairwise_distances_chunked
from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices
from ..utils.multiclass import check_classification_targets
Expand Down Expand Up @@ -276,9 +278,43 @@ def _pairwise(self):
class KNeighborsMixin(object):
"""Mixin for k-neighbors searches"""

def _kneighbors_reduce_func(self, dist, start,
n_neighbors, return_distance):
Copy link
Member

@rth rth May 27, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to decorate it with @staticmethod and remove self to help readability?

Copy link
Member Author

@jnothman jnothman May 28, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It uses self.effective_metric_

"""Reduce a chunk of distances to the nearest neighbors

Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`

Parameters
----------
dist : array of shape (n_samples_chunk, n_samples)
start : int
The index in X which the first row of dist corresponds to.
n_neighbors : int
return_distance : bool

Returns
-------
dist : array of shape (n_samples_chunk, n_neighbors), optional
Returned only if return_distance
neigh : array of shape (n_samples_chunk, n_neighbors)
"""
sample_range = np.arange(dist.shape[0])[:, None]
neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)
neigh_ind = neigh_ind[:, :n_neighbors]
# argpartition doesn't guarantee sorted order, so we sort again
neigh_ind = neigh_ind[
sample_range, np.argsort(dist[sample_range, neigh_ind])]
if return_distance:
if self.effective_metric_ == 'euclidean':
result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
else:
result = dist[sample_range, neigh_ind], neigh_ind
else:
result = neigh_ind
return result

def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
"""Finds the K-neighbors of a point.

Returns indices of and distances to the neighbors of each point.

Parameters
Expand Down Expand Up @@ -367,28 +403,19 @@ class from an array representing our data set and ask who's

n_jobs = _get_n_jobs(self.n_jobs)
if self._fit_method == 'brute':
# for efficiency, use squared euclidean distances
if self.effective_metric_ == 'euclidean':
dist = pairwise_distances(X, self._fit_X, 'euclidean',
n_jobs=n_jobs, squared=True)
else:
dist = pairwise_distances(
X, self._fit_X, self.effective_metric_, n_jobs=n_jobs,
**self.effective_metric_params_)

neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)
neigh_ind = neigh_ind[:, :n_neighbors]
# argpartition doesn't guarantee sorted order, so we sort again
neigh_ind = neigh_ind[
sample_range, np.argsort(dist[sample_range, neigh_ind])]
reduce_func = partial(self._kneighbors_reduce_func,
n_neighbors=n_neighbors,
return_distance=return_distance)

if return_distance:
if self.effective_metric_ == 'euclidean':
result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
else:
result = dist[sample_range, neigh_ind], neigh_ind
else:
result = neigh_ind
# for efficiency, use squared euclidean distances
kwds = ({'squared': True} if self.effective_metric_ == 'euclidean'
else self.effective_metric_params_)

result = pairwise_distances_chunked(
X, self._fit_X, reduce_func=reduce_func,
metric=self.effective_metric_, n_jobs=n_jobs,
**kwds)

elif self._fit_method in ['ball_tree', 'kd_tree']:
if issparse(X):
Expand All @@ -400,14 +427,15 @@ class from an array representing our data set and ask who's
X[s], n_neighbors, return_distance)
for s in gen_even_slices(X.shape[0], n_jobs)
)
if return_distance:
dist, neigh_ind = tuple(zip(*result))
result = np.vstack(dist), np.vstack(neigh_ind)
else:
result = np.vstack(result)
else:
raise ValueError("internal: _fit_method not recognized")

if return_distance:
dist, neigh_ind = zip(*result)
result = np.vstack(dist), np.vstack(neigh_ind)
else:
result = np.vstack(result)

if not query_is_train:
return result
else:
Expand Down Expand Up @@ -519,6 +547,40 @@ def kneighbors_graph(self, X=None, n_neighbors=None,
class RadiusNeighborsMixin(object):
"""Mixin for radius-based neighbors searches"""

def _radius_neighbors_reduce_func(self, dist, start,
radius, return_distance):
"""Reduce a chunk of distances to the nearest neighbors

Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`

Parameters
----------
dist : array of shape (n_samples_chunk, n_samples)
start : int
The index in X which the first row of dist corresponds to.
radius : float
return_distance : bool

Returns
-------
dist : list of n_samples_chunk 1d arrays, optional
Returned only if return_distance
neigh : list of n_samples_chunk 1d arrays
"""
neigh_ind = [np.where(d <= radius)[0] for d in dist]

if return_distance:
if self.effective_metric_ == 'euclidean':
dist = [np.sqrt(d[neigh_ind[i]])
for i, d in enumerate(dist)]
else:
dist = [d[neigh_ind[i]]
for i, d in enumerate(dist)]
results = dist, neigh_ind
else:
results = neigh_ind
return results

def radius_neighbors(self, X=None, radius=None, return_distance=True):
"""Finds the neighbors within a given radius of a point or points.

Expand Down Expand Up @@ -597,39 +659,37 @@ class from an array representing our data set and ask who's
if radius is None:
radius = self.radius

n_samples = X.shape[0]
if self._fit_method == 'brute':
# for efficiency, use squared euclidean distances
if self.effective_metric_ == 'euclidean':
dist = pairwise_distances(X, self._fit_X, 'euclidean',
n_jobs=self.n_jobs, squared=True)
radius *= radius
kwds = {'squared': True}
else:
dist = pairwise_distances(X, self._fit_X,
self.effective_metric_,
n_jobs=self.n_jobs,
**self.effective_metric_params_)

neigh_ind_list = [np.where(d <= radius)[0] for d in dist]
kwds = self.effective_metric_params_

# See https://github.com/numpy/numpy/issues/5456
# if you want to understand why this is initialized this way.
neigh_ind = np.empty(n_samples, dtype='object')
neigh_ind[:] = neigh_ind_list
reduce_func = partial(self._radius_neighbors_reduce_func,
radius=radius,
return_distance=return_distance)

results = pairwise_distances_chunked(
X, self._fit_X, reduce_func=reduce_func,
metric=self.effective_metric_, n_jobs=self.n_jobs,
**kwds)
if return_distance:
dist_array = np.empty(n_samples, dtype='object')
if self.effective_metric_ == 'euclidean':
dist_list = [np.sqrt(d[neigh_ind[i]])
for i, d in enumerate(dist)]
else:
dist_list = [d[neigh_ind[i]]
for i, d in enumerate(dist)]
dist_array[:] = dist_list

results = dist_array, neigh_ind
dist_chunks, neigh_ind_chunks = zip(*results)
dist_list = sum(dist_chunks, [])
neigh_ind_list = sum(neigh_ind_chunks, [])
# See https://github.com/numpy/numpy/issues/5456
# if you want to understand why this is initialized this way.
dist = np.empty(len(dist_list), dtype='object')
dist[:] = dist_list
neigh_ind = np.empty(len(neigh_ind_list), dtype='object')
neigh_ind[:] = neigh_ind_list
results = dist, neigh_ind
else:
results = neigh_ind
neigh_ind_list = sum(results, [])
results = np.empty(len(neigh_ind_list), dtype='object')
results[:] = neigh_ind_list

elif self._fit_method in ['ball_tree', 'kd_tree']:
if issparse(X):
Expand Down