Skip to content

FEA add ReadonlyArrayWrapper #20903

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Sep 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sklearn/cluster/_k_means_common.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1],
floating, bint) nogil

cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], int[::1])

cpdef void _relocate_empty_clusters_sparse(
Expand Down
20 changes: 10 additions & 10 deletions sklearn/cluster/_k_means_common.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,10 @@ def _euclidean_sparse_dense_wrapper(


cpdef floating _inertia_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers, # IN
int[::1] labels, # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers, # IN
int[::1] labels, # IN
int n_threads):
"""Compute inertia for dense input data

Expand Down Expand Up @@ -161,12 +161,12 @@ cpdef floating _inertia_sparse(


cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
int[::1] labels): # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
int[::1] labels): # IN
"""Relocate centers which have no sample assigned to them."""
cdef:
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
Expand Down
49 changes: 23 additions & 26 deletions sklearn/cluster/_k_means_elkan.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ np.import_array()


def init_bounds_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[:, ::1] centers, # IN
floating[:, ::1] center_half_distances, # IN
int[::1] labels, # OUT
floating[::1] upper_bounds, # OUT
floating[:, ::1] lower_bounds): # OUT
floating[:, ::1] X, # IN READ-ONLY
floating[:, ::1] centers, # IN
floating[:, ::1] center_half_distances, # IN
int[::1] labels, # OUT
floating[::1] upper_bounds, # OUT
floating[:, ::1] lower_bounds): # OUT
"""Initialize upper and lower bounds for each sample for dense input data.

Given X, centers and the pairwise distances divided by 2.0 between the
Expand Down Expand Up @@ -182,17 +182,17 @@ def init_bounds_sparse(


def elkan_iter_chunked_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
floating[:, ::1] center_half_distances, # IN
floating[::1] distance_next_center, # IN
floating[::1] upper_bounds, # INOUT
floating[:, ::1] lower_bounds, # INOUT
int[::1] labels, # INOUT
floating[::1] center_shift, # OUT
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
floating[:, ::1] center_half_distances, # IN
floating[::1] distance_next_center, # IN
floating[::1] upper_bounds, # INOUT
floating[:, ::1] lower_bounds, # INOUT
int[::1] labels, # INOUT
floating[::1] center_shift, # OUT
int n_threads,
bint update_centers=True):
"""Single iteration of K-means Elkan algorithm with dense input.
Expand Down Expand Up @@ -292,7 +292,7 @@ def elkan_iter_chunked_dense(
end = start + n_samples_chunk

_update_chunk_dense(
&X[start, 0],
X[start: end],
sample_weight[start: end],
centers_old,
center_half_distances,
Expand Down Expand Up @@ -334,11 +334,8 @@ def elkan_iter_chunked_dense(


cdef void _update_chunk_dense(
floating *X, # IN
# expecting C alinged 2D array. XXX: Can be
# replaced by const memoryview when cython min
# version is >= 0.3
floating[::1] sample_weight, # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers_old, # IN
floating[:, ::1] center_half_distances, # IN
floating[::1] distance_next_center, # IN
Expand Down Expand Up @@ -383,7 +380,7 @@ cdef void _update_chunk_dense(
# between the sample and its current assigned center.
if not bounds_tight:
upper_bound = _euclidean_dense_dense(
X + i * n_features, &centers_old[label, 0], n_features, False)
&X[i, 0], &centers_old[label, 0], n_features, False)
lower_bounds[i, label] = upper_bound
bounds_tight = 1

Expand All @@ -394,7 +391,7 @@ cdef void _update_chunk_dense(
or (upper_bound > center_half_distances[label, j])):

distance = _euclidean_dense_dense(
X + i * n_features, &centers_old[j, 0], n_features, False)
&X[i, 0], &centers_old[j, 0], n_features, False)
lower_bounds[i, j] = distance
if distance < upper_bound:
label = j
Expand All @@ -406,7 +403,7 @@ cdef void _update_chunk_dense(
if update_centers:
weight_in_clusters[label] += sample_weight[i]
for k in range(n_features):
centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i]
centers_new[label * n_features + k] += X[i, k] * sample_weight[i]


def elkan_iter_chunked_sparse(
Expand Down
29 changes: 13 additions & 16 deletions sklearn/cluster/_k_means_lloyd.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ np.import_array()


def lloyd_iter_chunked_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
int[::1] labels, # OUT
floating[::1] center_shift, # OUT
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
int[::1] labels, # OUT
floating[::1] center_shift, # OUT
int n_threads,
bint update_centers=True):
"""Single iteration of K-means lloyd algorithm with dense input.
Expand Down Expand Up @@ -129,7 +129,7 @@ def lloyd_iter_chunked_dense(
end = start + n_samples_chunk

_update_chunk_dense(
&X[start, 0],
X[start: end],
sample_weight[start: end],
x_squared_norms[start: end],
centers_old,
Expand Down Expand Up @@ -162,11 +162,8 @@ def lloyd_iter_chunked_dense(


cdef void _update_chunk_dense(
floating *X, # IN
# expecting C alinged 2D array. XXX: Can be
# replaced by const memoryview when cython min
# version is >= 0.3
floating[::1] sample_weight, # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
floating[::1] centers_squared_norms, # IN
Expand Down Expand Up @@ -199,7 +196,7 @@ cdef void _update_chunk_dense(

# pairwise_distances += -2 * X.dot(C.T)
_gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features,
-2.0, X, n_features, &centers_old[0, 0], n_features,
-2.0, &X[0, 0], n_features, &centers_old[0, 0], n_features,
1.0, pairwise_distances, n_clusters)

for i in range(n_samples):
Expand All @@ -215,7 +212,7 @@ cdef void _update_chunk_dense(
if update_centers:
weight_in_clusters[label] += sample_weight[i]
for k in range(n_features):
centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i]
centers_new[label * n_features + k] += X[i, k] * sample_weight[i]


def lloyd_iter_chunked_sparse(
Expand Down
20 changes: 10 additions & 10 deletions sklearn/cluster/_k_means_minibatch.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ np.import_array()


def _minibatch_update_dense(
np.ndarray[floating, ndim=2, mode="c"] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_sums, # INOUT
int[::1] labels, # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_sums, # INOUT
int[::1] labels, # IN
int n_threads):
"""Update of the centers for dense MiniBatchKMeans.

Expand Down Expand Up @@ -59,7 +59,7 @@ def _minibatch_update_dense(
indices = <int*> malloc(n_samples * sizeof(int))

for cluster_idx in prange(n_clusters, schedule="static"):
update_center_dense(cluster_idx, &X[0, 0], sample_weight,
update_center_dense(cluster_idx, X, sample_weight,
centers_old, centers_new, weight_sums, labels,
indices)

Expand All @@ -68,8 +68,8 @@ def _minibatch_update_dense(

cdef void update_center_dense(
int cluster_idx,
floating *X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_sums, # INOUT
Expand Down Expand Up @@ -103,7 +103,7 @@ cdef void update_center_dense(
for k in range(n_indices):
sample_idx = indices[k]
for feature_idx in range(n_features):
centers_new[cluster_idx, feature_idx] += X[sample_idx * n_features + feature_idx] * sample_weight[sample_idx]
centers_new[cluster_idx, feature_idx] += X[sample_idx, feature_idx] * sample_weight[sample_idx]

# Update the count statistics for this center
weight_sums[cluster_idx] += wsum
Expand Down
10 changes: 9 additions & 1 deletion sklearn/cluster/_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from ..utils import deprecated
from ..utils.validation import check_is_fitted, _check_sample_weight
from ..utils._openmp_helpers import _openmp_effective_n_threads
from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
from ..exceptions import ConvergenceWarning
from ._k_means_common import CHUNK_SIZE
from ._k_means_common import _inertia_dense
Expand Down Expand Up @@ -729,6 +730,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1):
else:
_labels = lloyd_iter_chunked_dense
_inertia = _inertia_dense
X = ReadonlyArrayWrapper(X)

_labels(
X,
Expand Down Expand Up @@ -1449,7 +1451,13 @@ def _mini_batch_step(
)
else:
_minibatch_update_dense(
X, sample_weight, centers, centers_new, weight_sums, labels, n_threads
ReadonlyArrayWrapper(X),
sample_weight,
centers,
centers_new,
weight_sums,
labels,
n_threads,
)

# Reassign clusters that have very low weight
Expand Down
68 changes: 68 additions & 0 deletions sklearn/utils/_readonly_array_wrapper.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
ReadonlyArrayWrapper implements the buffer protocol to make the wrapped buffer behave
as if writeable, even for readonly buffers. This way, even readonly arrays can be
passed as argument of type (non const) memoryview.
This is a workaround for the missing support for const fused-typed memoryviews in
Cython < 3.0.

Note: All it does is LIE about the readonly attribute: it reports it as False!
This way, we can use it on arrays that we don't touch.
!!! USE CAREFULLY !!!
"""
# TODO: Remove with Cython >= 3.0 which supports const memoryviews for fused types.

from cpython cimport Py_buffer
from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_WRITABLE

import numpy as np
cimport numpy as np


np.import_array()


ctypedef fused NUM_TYPES:
np.npy_float64
np.npy_float32
np.npy_int64
np.npy_int32


cdef class ReadonlyArrayWrapper:
    """Expose a wrapped buffer as writable even when it is readonly.

    The wrapper forwards the buffer request to the wrapped object but, when a
    writable buffer was asked for, strips that requirement and then clears the
    ``readonly`` flag on the returned buffer. The consumer must therefore
    never actually write through the resulting view.
    """
    # The wrapped object; must support the buffer protocol
    # (e.g. a NumPy array).
    cdef object wraps

    def __init__(self, wraps):
        self.wraps = wraps

    def __getbuffer__(self, Py_buffer *buffer, int flags):
        request_for_writeable = False
        if flags & PyBUF_WRITABLE:
            # Remove PyBUF_WRITABLE before forwarding: a readonly underlying
            # buffer would otherwise reject the request with an error.
            flags ^= PyBUF_WRITABLE
            request_for_writeable = True
        PyObject_GetBuffer(self.wraps, buffer, flags)
        if request_for_writeable:
            # The following is a lie when self.wraps is readonly!
            # It makes the buffer acceptable for non-const memoryviews.
            buffer.readonly = False

    def __releasebuffer__(self, Py_buffer *buffer):
        # Forward the release to the underlying buffer's owner.
        PyBuffer_Release(buffer)


def _test_sum(NUM_TYPES[:] x):
    """Return the sum of the elements of x. This function is for testing only.

    As this function does not modify x, we would like to define it as

        _test_sum(const NUM_TYPES[:] x)

    which is not possible as fused typed const memoryviews aren't
    supported in Cython<3.0.
    """
    cdef:
        int i
        int n = x.shape[0]
        # Named `total` rather than `sum` to avoid shadowing the builtin.
        NUM_TYPES total = 0

    for i in range(n):
        total += x[i]
    return total
6 changes: 6 additions & 0 deletions sklearn/utils/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ def configuration(parent_package="", top_path=None):
libraries=libraries,
)

config.add_extension(
"_readonly_array_wrapper",
sources=["_readonly_array_wrapper.pyx"],
libraries=libraries,
)

config.add_subpackage("tests")

return config
Expand Down
Loading