Skip to content

MAINT Miscellaneous maintenance items the private PairwiseDistancesReductions submodule #24542

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Nov 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
3ad593b
Reword 'radius_neighborhood' for 'radius_neigbors'
jjerphan Sep 29, 2022
cb086a4
Remove out-of-date comment
jjerphan Sep 29, 2022
f2a6686
Reword to use BaseDistancesReduction
jjerphan Sep 29, 2022
205edb2
Clarify comments regarding float32 and float64
jjerphan Sep 29, 2022
290e138
Use 'potential' instead of 'eventual'
jjerphan Sep 29, 2022
ca16041
GEMMTermComputer.{_compute_distances_on_chunks→_compute_dist_middle_t…
jjerphan Sep 29, 2022
cd768f9
Use safer unpacking for {X,Y}_norm_squared
jjerphan Sep 29, 2022
e60059a
Simplify Tempita preprocessing
jjerphan Sep 29, 2022
22338ac
Add a negative zeros and NaNs guard for the Euclidean specialisation
jjerphan Sep 29, 2022
b802a05
Use keywords arguments for calls to heap_push
jjerphan Sep 29, 2022
13ced49
Reword explanations for unsupported distance metrics
jjerphan Sep 29, 2022
631af0d
Remove unused symbols
jjerphan Sep 29, 2022
e6dcde8
DOC Make dispatchers' docstring uniform
jjerphan Sep 29, 2022
5485a4a
Remove unneeded references to DenseDenseDatasetsPair instances
jjerphan Sep 30, 2022
192dd92
Apply review comments
jjerphan Oct 7, 2022
b29c79d
Remove NaN guard and adapt -0. guard
jjerphan Oct 12, 2022
82a9f81
Merge branch 'main' into maint/pdr-misc
jjerphan Oct 13, 2022
29e3d88
Merge branch 'main' into maint/pdr-misc
jjerphan Oct 14, 2022
fa0e2ea
fixup! Merge branch 'main' into maint/pdr-misc
jjerphan Oct 14, 2022
18ba7ed
Merge branch 'main' into maint/pdr-misc
jjerphan Oct 17, 2022
3c2126d
Revert to explicit indexing
jjerphan Oct 17, 2022
8ddef01
fixup! Revert to explicit indexing
jjerphan Oct 17, 2022
f2e917b
Do not slice memoryviews in _compute_dist_middle_terms
jjerphan Oct 17, 2022
379ffae
Revert "Do not slice memoryviews in _compute_dist_middle_terms"
jjerphan Oct 21, 2022
5729f41
Merge branch 'main' into maint/pdr-misc
jjerphan Nov 4, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 51 additions & 35 deletions sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,11 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
for i in range(n_samples_X):
for j in range(n_samples_Y):
heap_push(
heaps_r_distances + i * self.k,
heaps_indices + i * self.k,
self.k,
self.datasets_pair.surrogate_dist(X_start + i, Y_start + j),
Y_start + j,
values=heaps_r_distances + i * self.k,
indices=heaps_indices + i * self.k,
size=self.k,
val=self.datasets_pair.surrogate_dist(X_start + i, Y_start + j),
val_idx=Y_start + j,
)

cdef void _parallel_on_X_init_chunk(
Expand Down Expand Up @@ -255,11 +255,11 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
for thread_num in range(self.chunks_n_threads):
for jdx in range(self.k):
heap_push(
&self.argkmin_distances[X_start + idx, 0],
&self.argkmin_indices[X_start + idx, 0],
self.k,
self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx],
self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
values=&self.argkmin_distances[X_start + idx, 0],
indices=&self.argkmin_indices[X_start + idx, 0],
size=self.k,
val=self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx],
val_idx=self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
)

cdef void _parallel_on_Y_finalize(
Expand Down Expand Up @@ -292,7 +292,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
num_threads=self.effective_n_threads):
for j in range(self.k):
distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist(
# Guard against eventual -0., causing nan production.
# Guard against potential -0., causing nan production.
max(distances[i, j], 0.)
)

Expand All @@ -304,7 +304,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):

# Values are returned identically to the way `KNeighborsMixin.kneighbors`
# returns values. This is counter-intuitive but this allows not using
# complex adaptations where `ArgKmin{{name_suffix}}.compute` is called.
# complex adaptations where `ArgKmin.compute` is called.
return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices)

return np.asarray(self.argkmin_indices)
Expand All @@ -330,8 +330,10 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
):
if (
metric_kwargs is not None and
len(metric_kwargs) > 0 and
"Y_norm_squared" not in metric_kwargs
len(metric_kwargs) > 0 and (
"Y_norm_squared" not in metric_kwargs or
"X_norm_squared" not in metric_kwargs
)
):
warnings.warn(
f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "
Expand Down Expand Up @@ -365,20 +367,31 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
metric_kwargs.pop("Y_norm_squared"),
ensure_2d=False,
input_name="Y_norm_squared",
dtype=np.float64
dtype=np.float64,
)
else:
self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(
Y, self.effective_n_threads
Y,
self.effective_n_threads,
)

# Do not recompute norms if datasets are identical.
self.X_norm_squared = (
self.Y_norm_squared if X is Y else
_sqeuclidean_row_norms{{name_suffix}}(
X, self.effective_n_threads
if metric_kwargs is not None and "X_norm_squared" in metric_kwargs:
self.X_norm_squared = check_array(
metric_kwargs.pop("X_norm_squared"),
ensure_2d=False,
input_name="X_norm_squared",
dtype=np.float64,
)
)
else:
# Do not recompute norms if datasets are identical.
self.X_norm_squared = (
self.Y_norm_squared if X is Y else
_sqeuclidean_row_norms{{name_suffix}}(
X,
self.effective_n_threads,
)
)

self.use_squared_distances = use_squared_distances

@final
Expand Down Expand Up @@ -470,6 +483,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
) nogil:
cdef:
ITYPE_t i, j
DTYPE_t sqeuclidean_dist_i_j
ITYPE_t n_X = X_end - X_start
ITYPE_t n_Y = Y_end - Y_start
DTYPE_t * dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms(
Expand All @@ -483,20 +497,22 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
# which keep tracks of the argkmin.
for i in range(n_X):
for j in range(n_Y):
sqeuclidean_dist_i_j = (
self.X_norm_squared[i + X_start] +
dist_middle_terms[i * n_Y + j] +
self.Y_norm_squared[j + Y_start]
)

# Catastrophic cancellation might cause -0. to be present,
# e.g. when computing d(x_i, y_i) when X is Y.
sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j)

heap_push(
heaps_r_distances + i * self.k,
heaps_indices + i * self.k,
self.k,
# Using the squared euclidean distance as the rank-preserving distance:
#
# ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
#
(
self.X_norm_squared[i + X_start] +
dist_middle_terms[i * n_Y + j] +
self.Y_norm_squared[j + Y_start]
),
j + Y_start,
values=heaps_r_distances + i * self.k,
indices=heaps_indices + i * self.k,
size=self.k,
val=sqeuclidean_dist_i_j,
val_idx=j + Y_start,
)

{{endfor}}
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
for j in range(deref(self.neigh_indices)[i].size()):
deref(self.neigh_distances)[i][j] = (
self.datasets_pair.distance_metric._rdist_to_dist(
# Guard against eventual -0., causing nan production.
# Guard against potential -0., causing nan production.
max(deref(self.neigh_distances)[i][j], 0.)
)
)
Expand All @@ -338,8 +338,10 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
):
if (
metric_kwargs is not None and
len(metric_kwargs) > 0 and
"Y_norm_squared" not in metric_kwargs
len(metric_kwargs) > 0 and (
"Y_norm_squared" not in metric_kwargs or
"X_norm_squared" not in metric_kwargs
)
):
warnings.warn(
f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "
Expand Down Expand Up @@ -374,16 +376,31 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
metric_kwargs.pop("Y_norm_squared"),
ensure_2d=False,
input_name="Y_norm_squared",
dtype=np.float64
dtype=np.float64,
)
else:
self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(Y, self.effective_n_threads)
self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(
Y,
self.effective_n_threads,
)

if metric_kwargs is not None and "X_norm_squared" in metric_kwargs:
self.X_norm_squared = check_array(
metric_kwargs.pop("X_norm_squared"),
ensure_2d=False,
input_name="X_norm_squared",
dtype=np.float64,
)
else:
# Do not recompute norms if datasets are identical.
self.X_norm_squared = (
self.Y_norm_squared if X is Y else
_sqeuclidean_row_norms{{name_suffix}}(
X,
self.effective_n_threads,
)
)

# Do not recompute norms if datasets are identical.
self.X_norm_squared = (
self.Y_norm_squared if X is Y else
_sqeuclidean_row_norms{{name_suffix}}(X, self.effective_n_threads)
)
self.use_squared_distances = use_squared_distances

if use_squared_distances:
Expand Down Expand Up @@ -480,7 +497,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
) nogil:
cdef:
ITYPE_t i, j
DTYPE_t squared_dist_i_j
DTYPE_t sqeuclidean_dist_i_j
ITYPE_t n_X = X_end - X_start
ITYPE_t n_Y = Y_end - Y_start
DTYPE_t *dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms(
Expand All @@ -490,17 +507,18 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
# Pushing the distance and their associated indices in vectors.
for i in range(n_X):
for j in range(n_Y):
# Using the squared euclidean distance as the rank-preserving distance:
#
# ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
#
squared_dist_i_j = (
sqeuclidean_dist_i_j = (
self.X_norm_squared[i + X_start]
+ dist_middle_terms[i * n_Y + j]
+ self.Y_norm_squared[j + Y_start]
)
if squared_dist_i_j <= self.r_radius:
deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j)

# Catastrophic cancellation might cause -0. to be present,
# e.g. when computing d(x_i, y_i) when X is Y.
sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j)

if sqeuclidean_dist_i_j <= self.r_radius:
deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(sqeuclidean_dist_i_j)
deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start)

{{endfor}}