From 82eb949edf3ec5cd2e18c9abf13e504d3abe3f6e Mon Sep 17 00:00:00 2001 From: jiefangxuanyan <505745416@qq.com> Date: Thu, 3 May 2018 16:06:24 +0800 Subject: [PATCH 1/4] Use std::nth_element from C++ standard library. --- sklearn/neighbors/binary_tree.pxi | 69 +-------------------------- sklearn/neighbors/nth_element.pxd | 8 ++++ sklearn/neighbors/nth_element.pyx | 9 ++++ sklearn/neighbors/setup.py | 7 +++ sklearn/neighbors/src/nth_element.cpp | 31 ++++++++++++ sklearn/neighbors/src/nth_element.h | 9 ++++ 6 files changed, 66 insertions(+), 67 deletions(-) create mode 100644 sklearn/neighbors/nth_element.pxd create mode 100644 sklearn/neighbors/nth_element.pyx create mode 100644 sklearn/neighbors/src/nth_element.cpp create mode 100644 sklearn/neighbors/src/nth_element.h diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index edf78257c9b23..1764d0b07583e 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -161,6 +161,8 @@ from dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) +from nth_element cimport partition_node_indices + np.import_array() # some handy constants @@ -791,73 +793,6 @@ cdef ITYPE_t find_node_split_dim(DTYPE_t* data, return j_max -cdef int partition_node_indices(DTYPE_t* data, - ITYPE_t* node_indices, - ITYPE_t split_dim, - ITYPE_t split_index, - ITYPE_t n_features, - ITYPE_t n_points) except -1: - """Partition points in the node into two equal-sized groups. - - Upon return, the values in node_indices will be rearranged such that - (assuming numpy-style indexing): - - data[node_indices[0:split_index], split_dim] - <= data[node_indices[split_index], split_dim] - - and - - data[node_indices[split_index], split_dim] - <= data[node_indices[split_index:n_points], split_dim] - - The algorithm is essentially a partial in-place quicksort around a - set pivot. - - Parameters - ---------- - data : double pointer - Pointer to a 2D array of the training data, of shape [N, n_features]. - N must be greater than any of the values in node_indices. - node_indices : int pointer - Pointer to a 1D array of length n_points. This lists the indices of - each of the points within the current node. This will be modified - in-place. - split_dim : int - the dimension on which to split. This will usually be computed via - the routine ``find_node_split_dim`` - split_index : int - the index within node_indices around which to split the points. - - Returns - ------- - status : int - integer exit status. On return, the contents of node_indices are - modified as noted above. - """ - cdef ITYPE_t left, right, midindex, i - cdef DTYPE_t d1, d2 - left = 0 - right = n_points - 1 - - while True: - midindex = left - for i in range(left, right): - d1 = data[node_indices[i] * n_features + split_dim] - d2 = data[node_indices[right] * n_features + split_dim] - if d1 < d2: - swap(node_indices, i, midindex) - midindex += 1 - swap(node_indices, midindex, right) - if midindex == split_index: - break - elif midindex < split_index: - left = midindex + 1 - else: - right = midindex - 1 - - return 0 - - ###################################################################### # NodeHeap : min-heap used to keep track of nodes during # breadth-first query diff --git a/sklearn/neighbors/nth_element.pxd b/sklearn/neighbors/nth_element.pxd new file mode 100644 index 0000000000000..89683c112bcbb --- /dev/null +++ b/sklearn/neighbors/nth_element.pxd @@ -0,0 +1,8 @@ +from typedefs cimport DTYPE_t, ITYPE_t + +cdef int partition_node_indices(DTYPE_t *data, + ITYPE_t *node_indices, + ITYPE_t split_dim, + ITYPE_t split_index, + ITYPE_t n_features, + ITYPE_t n_points) diff --git a/sklearn/neighbors/nth_element.pyx b/sklearn/neighbors/nth_element.pyx new file mode 100644 index 0000000000000..a897de30ede35 --- /dev/null +++ b/sklearn/neighbors/nth_element.pyx @@ -0,0 +1,9 @@ +from typedefs cimport DTYPE_t, ITYPE_t + +cdef extern from "nth_element.h": + int partition_node_indices(DTYPE_t*data, + ITYPE_t*node_indices, + ITYPE_t split_dim, + ITYPE_t split_index, + ITYPE_t n_features, + ITYPE_t n_points) diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index 8b1ad7bac9fab..1191234183523 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -20,6 +20,13 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) + config.add_extension('nth_element', + sources=['nth_element.pyx', + os.path.join('src', 'nth_element.cpp')], + include_dirs=['src'], + language="c++", + libraries=libraries) + config.add_extension('dist_metrics', sources=['dist_metrics.pyx'], include_dirs=[numpy.get_include(), diff --git a/sklearn/neighbors/src/nth_element.cpp b/sklearn/neighbors/src/nth_element.cpp new file mode 100644 index 0000000000000..630036eec22f2 --- /dev/null +++ b/sklearn/neighbors/src/nth_element.cpp @@ -0,0 +1,31 @@ +#include "nth_element.h" +#include + +class IndexComparator { + double *data; + Py_intptr_t split_dim, n_features; + +public: + IndexComparator(double *data, Py_intptr_t split_dim, Py_intptr_t n_features): + data(data), split_dim(split_dim), n_features(n_features) {} + + bool operator()(Py_intptr_t a, Py_intptr_t b) { + return data[a * n_features + split_dim] + < data[b * n_features + split_dim]; + } +}; + +int partition_node_indices(double *data, + Py_intptr_t *node_indices, + Py_intptr_t split_dim, + Py_intptr_t split_index, + Py_intptr_t n_features, + Py_intptr_t n_points) { + IndexComparator index_comparator(data, split_dim, n_features); + std::nth_element(node_indices, + node_indices + split_index, + node_indices + n_points, + index_comparator); + return 0; +} + diff --git a/sklearn/neighbors/src/nth_element.h b/sklearn/neighbors/src/nth_element.h new file mode 100644 index 0000000000000..cf2403271a65d --- /dev/null +++ b/sklearn/neighbors/src/nth_element.h @@ -0,0 +1,9 @@ +#include + +int partition_node_indices(double *data, + Py_intptr_t *node_indices, + Py_intptr_t split_dim, + Py_intptr_t split_index, + Py_intptr_t n_features, + Py_intptr_t n_points); + From 076069a39489e160f2d485697d2b13c55749612b Mon Sep 17 00:00:00 2001 From: jiefangxuanyan <505745416@qq.com> Date: Fri, 8 Jun 2018 15:07:47 +0800 Subject: [PATCH 2/4] Add a benchmark to expose time complexity degeneration problem of the original partition algorithm. --- benchmarks/bench_binary_tree_partition.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 benchmarks/bench_binary_tree_partition.py diff --git a/benchmarks/bench_binary_tree_partition.py b/benchmarks/bench_binary_tree_partition.py new file mode 100644 index 0000000000000..ee0aa6188ef9b --- /dev/null +++ b/benchmarks/bench_binary_tree_partition.py @@ -0,0 +1,22 @@ +from sklearn.neighbors import KDTree +import numpy as np +from time import time + + +def main(): + test_cases = [ + ("ordered", np.arange(50000, dtype=float)), + ("reverse ordered", np.arange(50000, 0, -1, dtype=float)), + ("duplicated", np.zeros([50000], dtype=float)) + ] + for name, case in test_cases: + expanded_case = np.expand_dims(case, -1) + begin = time() + tree = KDTree(expanded_case, leaf_size=1) + end = time() + del tree + print("{name}: {time}s".format(name=name, time=end - begin)) + + +if __name__ == "__main__": + main() From 3c34c7c8a47391b3ccf695744c905bc8c6181c53 Mon Sep 17 00:00:00 2001 From: jiefangxuanyan <505745416@qq.com> Date: Fri, 8 Jun 2018 15:12:43 +0800 Subject: [PATCH 3/4] Added a general case for binary tree partition algorithm benchmark. --- benchmarks/bench_binary_tree_partition.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/bench_binary_tree_partition.py b/benchmarks/bench_binary_tree_partition.py index ee0aa6188ef9b..b10e9b65c5657 100644 --- a/benchmarks/bench_binary_tree_partition.py +++ b/benchmarks/bench_binary_tree_partition.py @@ -5,6 +5,7 @@ def main(): test_cases = [ + ("random", np.random.rand(50000)), ("ordered", np.arange(50000, dtype=float)), ("reverse ordered", np.arange(50000, 0, -1, dtype=float)), ("duplicated", np.zeros([50000], dtype=float)) From cd112a89f0fdfa47a6ea797443addd8543004786 Mon Sep 17 00:00:00 2001 From: jiefangxuanyan <505745416@qq.com> Date: Sat, 11 Apr 2020 20:25:25 +0800 Subject: [PATCH 4/4] Multiple changes on the nth_element extension: 1. Add "_" in the file name; 2. Make the comparator stable; 3. Use a template implementation of partition_node_indices, the original one has hard-coded types(DTYPE_t -> double, ITYPE_t -> Py_intptr_t) which would be easily broken when these type changes. --- sklearn/neighbors/_binary_tree.pxi | 4 ++-- sklearn/neighbors/_nth_element.pxd | 9 +++++++ sklearn/neighbors/_nth_element.pyx | 25 +++++++++++++++++++ sklearn/neighbors/_nth_element_inner.h | 33 ++++++++++++++++++++++++++ sklearn/neighbors/nth_element.pxd | 8 ------- sklearn/neighbors/nth_element.pyx | 9 ------- sklearn/neighbors/setup.py | 7 +++--- sklearn/neighbors/src/nth_element.cpp | 31 ------------------------ sklearn/neighbors/src/nth_element.h | 9 ------- 9 files changed, 72 insertions(+), 63 deletions(-) create mode 100644 sklearn/neighbors/_nth_element.pxd create mode 100644 sklearn/neighbors/_nth_element.pyx create mode 100644 sklearn/neighbors/_nth_element_inner.h delete mode 100644 sklearn/neighbors/nth_element.pxd delete mode 100644 sklearn/neighbors/nth_element.pyx delete mode 100644 sklearn/neighbors/src/nth_element.cpp delete mode 100644 sklearn/neighbors/src/nth_element.h diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 4e757534d4eab..0ff0f94dfa563 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -159,11 +159,11 @@ from ._typedefs import DTYPE, ITYPE from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist, euclidean_rdist_to_dist) +from ._nth_element cimport partition_node_indices + cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) -from nth_element cimport partition_node_indices - np.import_array() # some handy constants diff --git a/sklearn/neighbors/_nth_element.pxd b/sklearn/neighbors/_nth_element.pxd new file mode 100644 index 0000000000000..522e826632824 --- /dev/null +++ b/sklearn/neighbors/_nth_element.pxd @@ -0,0 +1,9 @@ +from ._typedefs cimport DTYPE_t, ITYPE_t + +cdef int partition_node_indices( + DTYPE_t *data, + ITYPE_t *node_indices, + ITYPE_t split_dim, + ITYPE_t split_index, + ITYPE_t n_features, + ITYPE_t n_points) except -1 diff --git a/sklearn/neighbors/_nth_element.pyx b/sklearn/neighbors/_nth_element.pyx new file mode 100644 index 0000000000000..58cf9563c515c --- /dev/null +++ b/sklearn/neighbors/_nth_element.pyx @@ -0,0 +1,25 @@ +cdef extern from "_nth_element_inner.h": + void partition_node_indices_inner[D, I]( + D *data, + I *node_indices, + I split_dim, + I split_index, + I n_features, + I n_points) except + + + +cdef int partition_node_indices( + DTYPE_t *data, + ITYPE_t *node_indices, + ITYPE_t split_dim, + ITYPE_t split_index, + ITYPE_t n_features, + ITYPE_t n_points) except -1: + partition_node_indices_inner( + data, + node_indices, + split_dim, + split_index, + n_features, + n_points) + return 0 diff --git a/sklearn/neighbors/_nth_element_inner.h b/sklearn/neighbors/_nth_element_inner.h new file mode 100644 index 0000000000000..816addcf6b3db --- /dev/null +++ b/sklearn/neighbors/_nth_element_inner.h @@ -0,0 +1,33 @@ +#include + +template +class IndexComparator { +private: + const D *data; + I split_dim, n_features; +public: + IndexComparator(const D *data, const I &split_dim, const I &n_features): + data(data), split_dim(split_dim), n_features(n_features) {} + + bool operator()(const I &a, const I &b) const { + D a_value = data[a * n_features + split_dim]; + D b_value = data[b * n_features + split_dim]; + return a_value == b_value ? a < b : a_value < b_value; + } +}; + +template +void partition_node_indices_inner( + const D *data, + I *node_indices, + const I &split_dim, + const I &split_index, + const I &n_features, + const I &n_points) { + IndexComparator index_comparator(data, split_dim, n_features); + std::nth_element( + node_indices, + node_indices + split_index, + node_indices + n_points, + index_comparator); +} diff --git a/sklearn/neighbors/nth_element.pxd b/sklearn/neighbors/nth_element.pxd deleted file mode 100644 index 89683c112bcbb..0000000000000 --- a/sklearn/neighbors/nth_element.pxd +++ /dev/null @@ -1,8 +0,0 @@ -from typedefs cimport DTYPE_t, ITYPE_t - -cdef int partition_node_indices(DTYPE_t *data, - ITYPE_t *node_indices, - ITYPE_t split_dim, - ITYPE_t split_index, - ITYPE_t n_features, - ITYPE_t n_points) diff --git a/sklearn/neighbors/nth_element.pyx b/sklearn/neighbors/nth_element.pyx deleted file mode 100644 index a897de30ede35..0000000000000 --- a/sklearn/neighbors/nth_element.pyx +++ /dev/null @@ -1,9 +0,0 @@ -from typedefs cimport DTYPE_t, ITYPE_t - -cdef extern from "nth_element.h": - int partition_node_indices(DTYPE_t*data, - ITYPE_t*node_indices, - ITYPE_t split_dim, - ITYPE_t split_index, - ITYPE_t n_features, - ITYPE_t n_points) diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index e6d1f2d0eb602..fac1d63a75eb1 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -20,10 +20,9 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) - config.add_extension('nth_element', - sources=['nth_element.pyx', - os.path.join('src', 'nth_element.cpp')], - include_dirs=['src'], + config.add_extension('_nth_element', + sources=['_nth_element.pyx'], + include_dirs=[numpy.get_include()], language="c++", libraries=libraries) diff --git a/sklearn/neighbors/src/nth_element.cpp b/sklearn/neighbors/src/nth_element.cpp deleted file mode 100644 index 630036eec22f2..0000000000000 --- a/sklearn/neighbors/src/nth_element.cpp +++ /dev/null @@ -1,31 +0,0 @@ -#include "nth_element.h" -#include - -class IndexComparator { - double *data; - Py_intptr_t split_dim, n_features; - -public: - IndexComparator(double *data, Py_intptr_t split_dim, Py_intptr_t n_features): - data(data), split_dim(split_dim), n_features(n_features) {} - - bool operator()(Py_intptr_t a, Py_intptr_t b) { - return data[a * n_features + split_dim] - < data[b * n_features + split_dim]; - } -}; - -int partition_node_indices(double *data, - Py_intptr_t *node_indices, - Py_intptr_t split_dim, - Py_intptr_t split_index, - Py_intptr_t n_features, - Py_intptr_t n_points) { - IndexComparator index_comparator(data, split_dim, n_features); - std::nth_element(node_indices, - node_indices + split_index, - node_indices + n_points, - index_comparator); - return 0; -} - diff --git a/sklearn/neighbors/src/nth_element.h b/sklearn/neighbors/src/nth_element.h deleted file mode 100644 index cf2403271a65d..0000000000000 --- a/sklearn/neighbors/src/nth_element.h +++ /dev/null @@ -1,9 +0,0 @@ -#include - -int partition_node_indices(double *data, - Py_intptr_t *node_indices, - Py_intptr_t split_dim, - Py_intptr_t split_index, - Py_intptr_t n_features, - Py_intptr_t n_points); -