Skip to content

Fix radius neighbors memory leak #11055

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
9 changes: 9 additions & 0 deletions doc/whats_new/v0.20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,22 @@ Classifiers and regressors
only require X to be an object with finite length or shape.
:issue:`9832` by :user:`Vrishank Bhardwaj <vrishank97>`.

- :class:`neighbors.RadiusNeighborsRegressor` and
:class:`neighbors.RadiusNeighborsClassifier` are now
parallelized according to ``n_jobs`` regardless of ``algorithm``.
:issue:`8003` by :user:`Joël Billaud <recamshak>`.

Cluster

- :class:`cluster.KMeans`, :class:`cluster.MiniBatchKMeans` and
:func:`cluster.k_means` passed with ``algorithm='full'`` now enforces
row-major ordering, improving runtime.
:issue:`10471` by :user:`Gaurav Dhingra <gxyd>`.

- :class:`cluster.DBSCAN` now is parallelized according to ``n_jobs``
regardless of ``algorithm``.
:issue:`8003` by :user:`Joël Billaud <recamshak>`.

Datasets

- In :func:`datasets.make_blobs`, one can now pass a list to the `n_samples`
Expand Down
2 changes: 1 addition & 1 deletion sklearn/neighbors/ball_tree.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ cdef inline DTYPE_t max_dist(BinaryTree tree, ITYPE_t i_node,


cdef inline int min_max_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt,
DTYPE_t* min_dist, DTYPE_t* max_dist) except -1:
DTYPE_t* min_dist, DTYPE_t* max_dist) nogil except -1:
"""Compute the minimum and maximum distance between a point and a node"""
cdef DTYPE_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0],
tree.data.shape[1])
Expand Down
14 changes: 11 additions & 3 deletions sklearn/neighbors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,10 +619,18 @@ class from an array representing our data set and ask who's
raise ValueError(
"%s does not work with sparse matrices. Densify the data, "
"or set algorithm='brute'" % self._fit_method)
results = self._tree.query_radius(X, radius,
return_distance=return_distance)

n_jobs = _get_n_jobs(self.n_jobs)
results = Parallel(n_jobs, backend='threading')(
delayed(self._tree.query_radius, check_pickle=False)(
X[s], radius, return_distance)
for s in gen_even_slices(X.shape[0], n_jobs)
)
if return_distance:
results = results[::-1]
neigh_ind, dist = tuple(zip(*results))
results = np.hstack(dist), np.hstack(neigh_ind)
else:
results = np.hstack(results)
else:
raise ValueError("internal: _fit_method not recognized")

Expand Down
129 changes: 98 additions & 31 deletions sklearn/neighbors/binary_tree.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@
cimport cython
cimport numpy as np
from libc.math cimport fabs, sqrt, exp, cos, pow, log
from libc.stdlib cimport calloc, malloc, free
from libc.string cimport memcpy
from sklearn.utils.lgamma cimport lgamma

import numpy as np
Expand All @@ -156,6 +158,11 @@ from typedefs import DTYPE, ITYPE
from dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist,
euclidean_dist_to_rdist, euclidean_rdist_to_dist)

cdef extern from "numpy/arrayobject.h":
void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)

np.import_array()

# some handy constants
cdef DTYPE_t INF = np.inf
cdef DTYPE_t NEG_INF = -np.inf
Expand Down Expand Up @@ -545,7 +552,7 @@ cdef inline void swap(DITYPE_t* arr, ITYPE_t i1, ITYPE_t i2):


cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr,
ITYPE_t i1, ITYPE_t i2):
ITYPE_t i1, ITYPE_t i2) nogil:
"""swap the values at inex i1 and i2 of both darr and iarr"""
cdef DTYPE_t dtmp = darr[i1]
darr[i1] = darr[i2]
Expand Down Expand Up @@ -670,7 +677,7 @@ cdef class NeighborsHeap:


cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx,
ITYPE_t size) except -1:
ITYPE_t size) nogil except -1:
"""
Perform a recursive quicksort on the dist array, simultaneously
performing the same swaps on the idx array. The equivalent in
Expand Down Expand Up @@ -1341,7 +1348,7 @@ cdef class BinaryTree:
else:
return indices.reshape(X.shape[:X.ndim - 1] + (k,))

def query_radius(self, X, r, return_distance=False,
def query_radius(self, X, r, int return_distance=False,
int count_only=False, int sort_results=False):
"""
query_radius(self, X, r, count_only = False):
Expand Down Expand Up @@ -1406,6 +1413,8 @@ cdef class BinaryTree:
cdef DTYPE_t[::1] dist_arr_i
cdef ITYPE_t[::1] idx_arr_i, counts
cdef DTYPE_t* pt
cdef ITYPE_t** indices = NULL
cdef DTYPE_t** distances = NULL

# validate X and prepare for query
X = check_array(X, dtype=DTYPE, order='C')
Expand All @@ -1429,11 +1438,15 @@ cdef class BinaryTree:
rarr_np = r.reshape(-1) # store explicitly to keep in scope
cdef DTYPE_t[::1] rarr = get_memview_DTYPE_1D(rarr_np)

# prepare variables for iteration
if not count_only:
indices = np.zeros(Xarr.shape[0], dtype='object')
indices = <ITYPE_t**>calloc(Xarr.shape[0], sizeof(ITYPE_t*))
if indices == NULL:
raise MemoryError()
if return_distance:
distances = np.zeros(Xarr.shape[0], dtype='object')
distances = <DTYPE_t**>calloc(Xarr.shape[0], sizeof(DTYPE_t*))
if distances == NULL:
free(indices)
raise MemoryError()

np_idx_arr = np.zeros(self.data.shape[0], dtype=ITYPE)
idx_arr_i = get_memview_ITYPE_1D(np_idx_arr)
Expand All @@ -1445,33 +1458,89 @@ cdef class BinaryTree:
counts = get_memview_ITYPE_1D(counts_arr)

pt = &Xarr[0, 0]
for i in range(Xarr.shape[0]):
counts[i] = self._query_radius_single(0, pt, rarr[i],
&idx_arr_i[0],
&dist_arr_i[0],
0, count_only,
return_distance)
pt += n_features
memory_error = False
with nogil:
for i in range(Xarr.shape[0]):
counts[i] = self._query_radius_single(0, pt, rarr[i],
&idx_arr_i[0],
&dist_arr_i[0],
0, count_only,
return_distance)
pt += n_features

if count_only:
continue

if count_only:
pass
else:
if sort_results:
_simultaneous_sort(&dist_arr_i[0], &idx_arr_i[0],
counts[i])

indices[i] = np_idx_arr[:counts[i]].copy()
# equivalent to: indices[i] = np_idx_arr[:counts[i]].copy()
indices[i] = <ITYPE_t*>malloc(counts[i] * sizeof(ITYPE_t))
if indices[i] == NULL:
memory_error = True
break
memcpy(indices[i], &idx_arr_i[0], counts[i] * sizeof(ITYPE_t))

if return_distance:
distances[i] = np_dist_arr[:counts[i]].copy()
# equivalent to: distances[i] = np_dist_arr[:counts[i]].copy()
distances[i] = <DTYPE_t*>malloc(counts[i] * sizeof(DTYPE_t))
if distances[i] == NULL:
memory_error = True
break
memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof(DTYPE_t))

try:
if memory_error:
raise MemoryError()

if count_only:
# deflatten results
return counts_arr.reshape(X.shape[:X.ndim - 1])
elif return_distance:
indices_npy = np.zeros(Xarr.shape[0], dtype='object')
distances_npy = np.zeros(Xarr.shape[0], dtype='object')
for i in range(Xarr.shape[0]):
# make a new numpy array that wraps the existing data
indices_npy[i] = np.PyArray_SimpleNewFromData(1, &counts[i], np.NPY_INTP, indices[i])
# make sure the data will be freed when the numpy array is garbage collected
PyArray_ENABLEFLAGS(indices_npy[i], np.NPY_OWNDATA)
# make sure the data is not freed twice
indices[i] = NULL

# make a new numpy array that wraps the existing data
distances_npy[i] = np.PyArray_SimpleNewFromData(1, &counts[i], np.NPY_DOUBLE, distances[i])
# make sure the data will be freed when the numpy array is garbage collected
PyArray_ENABLEFLAGS(distances_npy[i], np.NPY_OWNDATA)
# make sure the data is not freed twice
distances[i] = NULL

# deflatten results
return (indices_npy.reshape(X.shape[:X.ndim - 1]),
distances_npy.reshape(X.shape[:X.ndim - 1]))
else:
indices_npy = np.zeros(Xarr.shape[0], dtype='object')
for i in range(Xarr.shape[0]):
# make a new numpy array that wraps the existing data
indices_npy[i] = np.PyArray_SimpleNewFromData(1, &counts[i], np.NPY_INTP, indices[i])
# make sure the data will be freed when the numpy array is garbage collected
PyArray_ENABLEFLAGS(indices_npy[i], np.NPY_OWNDATA)
# make sure the data is not freed twice
indices[i] = NULL

# deflatten results
return indices_npy.reshape(X.shape[:X.ndim - 1])
except:
# free any buffer that is not owned by a numpy array
for i in range(Xarr.shape[0]):
free(indices[i])
if return_distance:
free(distances[i])
raise
finally:
free(indices)
free(distances)

# deflatten results
if count_only:
return counts_arr.reshape(X.shape[:X.ndim - 1])
elif return_distance:
return (indices.reshape(X.shape[:X.ndim - 1]),
distances.reshape(X.shape[:X.ndim - 1]))
else:
return indices.reshape(X.shape[:X.ndim - 1])

def kernel_density(self, X, h, kernel='gaussian',
atol=0, rtol=1E-8,
Expand Down Expand Up @@ -1971,7 +2040,7 @@ cdef class BinaryTree:
DTYPE_t* distances,
ITYPE_t count,
int count_only,
int return_distance) except -1:
int return_distance) nogil:
"""recursive single-tree radius query, depth-first"""
cdef DTYPE_t* data = &self.data[0, 0]
cdef ITYPE_t* idx_array = &self.idx_array[0]
Expand Down Expand Up @@ -1999,8 +2068,7 @@ cdef class BinaryTree:
else:
for i in range(node_info.idx_start, node_info.idx_end):
if (count < 0) or (count >= self.data.shape[0]):
raise ValueError("Fatal: count too big: "
"this should never happen")
return -1
indices[count] = idx_array[i]
if return_distance:
distances[count] = self.dist(pt, (data + n_features
Expand All @@ -2019,8 +2087,7 @@ cdef class BinaryTree:
n_features)
if dist_pt <= reduced_r:
if (count < 0) or (count >= self.data.shape[0]):
raise ValueError("Fatal: count out of range. "
"This should never happen.")
return -1
if count_only:
pass
else:
Expand Down
9 changes: 7 additions & 2 deletions sklearn/neighbors/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,10 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin,
metric_params : dict, optional (default = None)
Additional keyword arguments for the metric function.

n_jobs : int, optional (default = 1)
The number of parallel jobs to run for neighbors search.
If ``-1``, then the number of jobs is set to the number of CPU cores.

Examples
--------
>>> X = [[0], [1], [2], [3]]
Expand Down Expand Up @@ -317,12 +321,13 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin,

def __init__(self, radius=1.0, weights='uniform',
algorithm='auto', leaf_size=30, p=2, metric='minkowski',
outlier_label=None, metric_params=None, **kwargs):
outlier_label=None, metric_params=None, n_jobs=1, **kwargs):
super(RadiusNeighborsClassifier, self).__init__(
radius=radius,
algorithm=algorithm,
leaf_size=leaf_size,
metric=metric, p=p, metric_params=metric_params, **kwargs)
metric=metric, p=p, metric_params=metric_params,
n_jobs=n_jobs, **kwargs)
self.weights = _check_weights(weights)
self.outlier_label = outlier_label

Expand Down
4 changes: 2 additions & 2 deletions sklearn/neighbors/dist_metrics.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1:
return dist * dist


cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) except -1:
cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) nogil except -1:
return sqrt(dist)


Expand Down Expand Up @@ -72,6 +72,6 @@ cdef class DistanceMetric:
cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y,
DTYPE_t[:, ::1] D) except -1

cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1
cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1

cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1
14 changes: 7 additions & 7 deletions sklearn/neighbors/dist_metrics.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ cdef class DistanceMetric:
D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1])
return 0

cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
"""Convert the reduced distance to the distance"""
return rdist

Expand Down Expand Up @@ -418,7 +418,7 @@ cdef class EuclideanDistance(DistanceMetric):
ITYPE_t size) nogil except -1:
return euclidean_rdist(x1, x2, size)

cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
return sqrt(rdist)

cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
Expand Down Expand Up @@ -462,7 +462,7 @@ cdef class SEuclideanDistance(DistanceMetric):
ITYPE_t size) nogil except -1:
return sqrt(self.rdist(x1, x2, size))

cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
return sqrt(rdist)

cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
Expand Down Expand Up @@ -551,7 +551,7 @@ cdef class MinkowskiDistance(DistanceMetric):
ITYPE_t size) nogil except -1:
return pow(self.rdist(x1, x2, size), 1. / self.p)

cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
return pow(rdist, 1. / self.p)

cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
Expand Down Expand Up @@ -610,7 +610,7 @@ cdef class WMinkowskiDistance(DistanceMetric):
ITYPE_t size) nogil except -1:
return pow(self.rdist(x1, x2, size), 1. / self.p)

cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
return pow(rdist, 1. / self.p)

cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
Expand Down Expand Up @@ -683,7 +683,7 @@ cdef class MahalanobisDistance(DistanceMetric):
ITYPE_t size) nogil except -1:
return sqrt(self.rdist(x1, x2, size))

cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
return sqrt(rdist)

cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
Expand Down Expand Up @@ -998,7 +998,7 @@ cdef class HaversineDistance(DistanceMetric):
return 2 * asin(sqrt(sin_0 * sin_0
+ cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1))

cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
return 2 * asin(sqrt(rdist))

cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
Expand Down
2 changes: 1 addition & 1 deletion sklearn/neighbors/kd_tree.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ cdef DTYPE_t max_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt) except -1:


cdef inline int min_max_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt,
DTYPE_t* min_dist, DTYPE_t* max_dist) except -1:
DTYPE_t* min_dist, DTYPE_t* max_dist) nogil except -1:
"""Compute the minimum and maximum distance between a point and a node"""
cdef ITYPE_t n_features = tree.data.shape[1]

Expand Down
Loading