Skip to content

MAINT Group all sorting utilities in sklearn.utils._sorting #25606

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from cython cimport final
from cython.parallel cimport parallel, prange

from ...utils._heap cimport heap_push
from ...utils._sorting cimport simultaneous_sort
from ...utils._sorting cimport simultaneous_quicksort
from ...utils._typedefs cimport intp_t, float64_t

import numpy as np
Expand Down Expand Up @@ -184,7 +184,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
# Sorting the main heaps portion associated to `X[X_start:X_end]`
# in ascending order w.r.t the distances.
for idx in range(X_end - X_start):
simultaneous_sort(
simultaneous_quicksort(
self.heaps_r_distances_chunks[thread_num] + idx * self.k,
self.heaps_indices_chunks[thread_num] + idx * self.k,
self.k
Expand Down Expand Up @@ -268,7 +268,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
# Sorting the main in ascending order w.r.t the distances.
# This is done in parallel sample-wise (no need for locks).
for idx in prange(self.n_samples_X, schedule='static'):
simultaneous_sort(
simultaneous_quicksort(
&self.argkmin_distances[idx, 0],
&self.argkmin_indices[idx, 0],
self.k,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from cython cimport final
from cython.operator cimport dereference as deref
from cython.parallel cimport parallel, prange

from ...utils._sorting cimport simultaneous_sort
from ...utils._sorting cimport simultaneous_quicksort
from ...utils._typedefs cimport intp_t, float64_t
from ...utils._vector_sentinel cimport vector_to_nd_array

Expand Down Expand Up @@ -218,7 +218,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
# Sorting neighbors for each query vector of X
if self.sort_results:
for idx in range(X_start, X_end):
simultaneous_sort(
simultaneous_quicksort(
deref(self.neigh_distances)[idx].data(),
deref(self.neigh_indices)[idx].data(),
deref(self.neigh_indices)[idx].size()
Expand Down Expand Up @@ -289,7 +289,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
# Sort in parallel in ascending order w.r.t the distances if requested.
if self.sort_results:
for idx in prange(self.n_samples_X, schedule='static'):
simultaneous_sort(
simultaneous_quicksort(
deref(self.neigh_distances)[idx].data(),
deref(self.neigh_indices)[idx].data(),
deref(self.neigh_indices)[idx].size()
Expand Down
25 changes: 15 additions & 10 deletions sklearn/neighbors/_binary_tree.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ from ._partition_nodes cimport partition_node_indices
from ..utils import check_array
from ..utils._typedefs cimport float64_t, intp_t
from ..utils._heap cimport heap_push
from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort
from ..utils._sorting cimport simultaneous_quicksort

cnp.import_array()

Expand Down Expand Up @@ -561,9 +561,9 @@ cdef class NeighborsHeap:
"""simultaneously sort the distances and indices"""
cdef intp_t row
for row in range(self.distances.shape[0]):
_simultaneous_sort(
dist=&self.distances[row, 0],
idx=&self.indices[row, 0],
simultaneous_quicksort(
values=&self.distances[row, 0],
indices=&self.indices[row, 0],
size=self.distances.shape[1],
)
return 0
Expand Down Expand Up @@ -1305,8 +1305,11 @@ cdef class BinaryTree:
continue

if sort_results:
_simultaneous_sort(&dist_arr_i[0], &idx_arr_i[0],
counts[i])
simultaneous_quicksort(
&dist_arr_i[0],
&idx_arr_i[0],
counts[i],
)

# equivalent to: indices[i] = np_idx_arr[:counts[i]].copy()
indices[i] = <intp_t*>malloc(counts[i] * sizeof(intp_t))
Expand Down Expand Up @@ -2388,15 +2391,17 @@ def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices):
"""In-place simultaneous sort the given row of the arrays

This python wrapper exists primarily to enable unit testing
of the _simultaneous_sort C routine.
of the simultaneous_quicksort C routine.
"""
assert distances.shape[0] == indices.shape[0]
assert distances.shape[1] == indices.shape[1]
cdef intp_t row
for row in range(distances.shape[0]):
_simultaneous_sort(&distances[row, 0],
&indices[row, 0],
distances.shape[1])
simultaneous_quicksort(
&distances[row, 0],
&indices[row, 0],
distances.shape[1],
)


def nodeheap_sort(float64_t[::1] vals):
Expand Down
26 changes: 0 additions & 26 deletions sklearn/neighbors/tests/test_neighbors_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,11 @@
BallTree,
kernel_norm,
NeighborsHeap as NeighborsHeapBT,
simultaneous_sort as simultaneous_sort_bt,
nodeheap_sort as nodeheap_sort_bt,
)
from sklearn.neighbors._kd_tree import (
KDTree,
NeighborsHeap as NeighborsHeapKDT,
simultaneous_sort as simultaneous_sort_kdt,
nodeheap_sort as nodeheap_sort_kdt,
)

Expand Down Expand Up @@ -188,30 +186,6 @@ def test_node_heap(nodeheap_sort, n_nodes=50):
assert_array_almost_equal(vals[i1], vals2)


@pytest.mark.parametrize(
"simultaneous_sort", [simultaneous_sort_bt, simultaneous_sort_kdt]
)
def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201):
rng = check_random_state(0)
dist = rng.random_sample((n_rows, n_pts)).astype(np.float64, copy=False)
ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(np.intp, copy=False)

dist2 = dist.copy()
ind2 = ind.copy()

# simultaneous sort rows using function
simultaneous_sort(dist, ind)

# simultaneous sort rows using numpy
i = np.argsort(dist2, axis=1)
row_ind = np.arange(n_rows)[:, None]
dist2 = dist2[row_ind, i]
ind2 = ind2[row_ind, i]

assert_array_almost_equal(dist, dist2)
assert_array_almost_equal(ind, ind2)


@pytest.mark.parametrize("Cls", [KDTree, BallTree])
def test_gaussian_kde(Cls, n_samples=1000):
# Compare gaussian KDE results to scipy.stats.gaussian_kde
Expand Down
123 changes: 7 additions & 116 deletions sklearn/tree/_splitter.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ import numpy as np

from scipy.sparse import csc_matrix

from ..utils._sorting cimport simultaneous_introsort as sort
# TODO: when Cython>=3.0 is used, remove the <intp_t*> casts in call to sort.
from ..utils._typedefs cimport intp_t

from ._utils cimport log
from ._utils cimport rand_int
from ._utils cimport rand_uniform
Expand Down Expand Up @@ -413,119 +417,6 @@ cdef inline int node_split_best(
return 0


# Sort n-element arrays pointed to by feature_values and samples, simultaneously,
# by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997).
cdef inline void sort(DTYPE_t* feature_values, SIZE_t* samples, SIZE_t n) noexcept nogil:
if n == 0:
return
cdef int maxd = 2 * <int>log(n)
introsort(feature_values, samples, n, maxd)


cdef inline void swap(DTYPE_t* feature_values, SIZE_t* samples,
SIZE_t i, SIZE_t j) noexcept nogil:
# Helper for sort
feature_values[i], feature_values[j] = feature_values[j], feature_values[i]
samples[i], samples[j] = samples[j], samples[i]


cdef inline DTYPE_t median3(DTYPE_t* feature_values, SIZE_t n) noexcept nogil:
# Median of three pivot selection, after Bentley and McIlroy (1993).
# Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
cdef DTYPE_t a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1]
if a < b:
if b < c:
return b
elif a < c:
return c
else:
return a
elif b < c:
if a < c:
return a
else:
return c
else:
return b


# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(DTYPE_t* feature_values, SIZE_t *samples,
SIZE_t n, int maxd) noexcept nogil:
cdef DTYPE_t pivot
cdef SIZE_t i, l, r

while n > 1:
if maxd <= 0: # max depth limit exceeded ("gone quadratic")
heapsort(feature_values, samples, n)
return
maxd -= 1

pivot = median3(feature_values, n)

# Three-way partition.
i = l = 0
r = n
while i < r:
if feature_values[i] < pivot:
swap(feature_values, samples, i, l)
i += 1
l += 1
elif feature_values[i] > pivot:
r -= 1
swap(feature_values, samples, i, r)
else:
i += 1

introsort(feature_values, samples, l, maxd)
feature_values += r
samples += r
n -= r


cdef inline void sift_down(DTYPE_t* feature_values, SIZE_t* samples,
SIZE_t start, SIZE_t end) noexcept nogil:
# Restore heap order in feature_values[start:end] by moving the max element to start.
cdef SIZE_t child, maxind, root

root = start
while True:
child = root * 2 + 1

# find max of root, left child, right child
maxind = root
if child < end and feature_values[maxind] < feature_values[child]:
maxind = child
if child + 1 < end and feature_values[maxind] < feature_values[child + 1]:
maxind = child + 1

if maxind == root:
break
else:
swap(feature_values, samples, root, maxind)
root = maxind


cdef void heapsort(DTYPE_t* feature_values, SIZE_t* samples, SIZE_t n) noexcept nogil:
cdef SIZE_t start, end

# heapify
start = (n - 2) / 2
end = n
while True:
sift_down(feature_values, samples, start, end)
if start == 0:
break
start -= 1

# sort by shrinking the heap, putting the max element immediately after it
end = n - 1
while end > 0:
swap(feature_values, samples, 0, end)
sift_down(feature_values, samples, 0, end)
end = end - 1

cdef inline int node_split_random(
Splitter splitter,
Partitioner partitioner,
Expand Down Expand Up @@ -742,7 +633,7 @@ cdef class DensePartitioner:
# effectively.
for i in range(self.start, self.end):
feature_values[i] = X[samples[i], current_feature]
sort(&feature_values[self.start], &samples[self.start], self.end - self.start)
sort(&feature_values[self.start], <intp_t*>&samples[self.start], self.end - self.start)

cdef inline void find_min_max(
self,
Expand Down Expand Up @@ -901,9 +792,9 @@ cdef class SparsePartitioner:

self.extract_nnz(current_feature)
# Sort the positive and negative parts of `feature_values`
sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start)
sort(&feature_values[self.start], <intp_t*>&samples[self.start], self.end_negative - self.start)
if self.start_positive < self.end:
sort(&feature_values[self.start_positive], &samples[self.start_positive],
sort(&feature_values[self.start_positive], <intp_t*>&samples[self.start_positive],
self.end - self.start_positive)

# Update index_to_samples to take into account the sort
Expand Down
20 changes: 16 additions & 4 deletions sklearn/utils/_sorting.pxd
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
from cython cimport floating

from ._typedefs cimport intp_t

from cython cimport floating
cdef void simultaneous_quicksort(
floating* values,
intp_t* indices,
intp_t size,
) noexcept nogil

cdef void simultaneous_introsort(
floating* values,
intp_t* indices,
intp_t size,
) noexcept nogil

cdef int simultaneous_sort(
floating *dist,
intp_t *idx,
cdef void simultaneous_heapsort(
floating* values,
intp_t* indices,
intp_t size,
) noexcept nogil
Loading