Skip to content

FEA add ReadonlyArrayWrapper #20903

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Sep 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sklearn/cluster/_k_means_common.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1],
floating, bint) nogil

cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], int[::1])

cpdef void _relocate_empty_clusters_sparse(
Expand Down
20 changes: 10 additions & 10 deletions sklearn/cluster/_k_means_common.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,10 @@ def _euclidean_sparse_dense_wrapper(


cpdef floating _inertia_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers, # IN
int[::1] labels, # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers, # IN
int[::1] labels, # IN
int n_threads):
"""Compute inertia for dense input data

Expand Down Expand Up @@ -161,12 +161,12 @@ cpdef floating _inertia_sparse(


cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
int[::1] labels): # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
int[::1] labels): # IN
"""Relocate centers which have no sample assigned to them."""
cdef:
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
Expand Down
49 changes: 23 additions & 26 deletions sklearn/cluster/_k_means_elkan.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ np.import_array()


def init_bounds_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[:, ::1] centers, # IN
floating[:, ::1] center_half_distances, # IN
int[::1] labels, # OUT
floating[::1] upper_bounds, # OUT
floating[:, ::1] lower_bounds): # OUT
floating[:, ::1] X, # IN READ-ONLY
floating[:, ::1] centers, # IN
floating[:, ::1] center_half_distances, # IN
int[::1] labels, # OUT
floating[::1] upper_bounds, # OUT
floating[:, ::1] lower_bounds): # OUT
"""Initialize upper and lower bounds for each sample for dense input data.

Given X, centers and the pairwise distances divided by 2.0 between the
Expand Down Expand Up @@ -182,17 +182,17 @@ def init_bounds_sparse(


def elkan_iter_chunked_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
floating[:, ::1] center_half_distances, # IN
floating[::1] distance_next_center, # IN
floating[::1] upper_bounds, # INOUT
floating[:, ::1] lower_bounds, # INOUT
int[::1] labels, # INOUT
floating[::1] center_shift, # OUT
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
floating[:, ::1] center_half_distances, # IN
floating[::1] distance_next_center, # IN
floating[::1] upper_bounds, # INOUT
floating[:, ::1] lower_bounds, # INOUT
int[::1] labels, # INOUT
floating[::1] center_shift, # OUT
int n_threads,
bint update_centers=True):
"""Single iteration of K-means Elkan algorithm with dense input.
Expand Down Expand Up @@ -292,7 +292,7 @@ def elkan_iter_chunked_dense(
end = start + n_samples_chunk

_update_chunk_dense(
&X[start, 0],
X[start: end],
sample_weight[start: end],
centers_old,
center_half_distances,
Expand Down Expand Up @@ -334,11 +334,8 @@ def elkan_iter_chunked_dense(


cdef void _update_chunk_dense(
floating *X, # IN
# expecting C alinged 2D array. XXX: Can be
# replaced by const memoryview when cython min
# version is >= 0.3
floating[::1] sample_weight, # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers_old, # IN
floating[:, ::1] center_half_distances, # IN
floating[::1] distance_next_center, # IN
Expand Down Expand Up @@ -383,7 +380,7 @@ cdef void _update_chunk_dense(
# between the sample and its current assigned center.
if not bounds_tight:
upper_bound = _euclidean_dense_dense(
X + i * n_features, &centers_old[label, 0], n_features, False)
&X[i, 0], &centers_old[label, 0], n_features, False)
lower_bounds[i, label] = upper_bound
bounds_tight = 1

Expand All @@ -394,7 +391,7 @@ cdef void _update_chunk_dense(
or (upper_bound > center_half_distances[label, j])):

distance = _euclidean_dense_dense(
X + i * n_features, &centers_old[j, 0], n_features, False)
&X[i, 0], &centers_old[j, 0], n_features, False)
lower_bounds[i, j] = distance
if distance < upper_bound:
label = j
Expand All @@ -406,7 +403,7 @@ cdef void _update_chunk_dense(
if update_centers:
weight_in_clusters[label] += sample_weight[i]
for k in range(n_features):
centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i]
centers_new[label * n_features + k] += X[i, k] * sample_weight[i]


def elkan_iter_chunked_sparse(
Expand Down
29 changes: 13 additions & 16 deletions sklearn/cluster/_k_means_lloyd.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ np.import_array()


def lloyd_iter_chunked_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
int[::1] labels, # OUT
floating[::1] center_shift, # OUT
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
int[::1] labels, # OUT
floating[::1] center_shift, # OUT
int n_threads,
bint update_centers=True):
"""Single iteration of K-means lloyd algorithm with dense input.
Expand Down Expand Up @@ -129,7 +129,7 @@ def lloyd_iter_chunked_dense(
end = start + n_samples_chunk

_update_chunk_dense(
&X[start, 0],
X[start: end],
sample_weight[start: end],
x_squared_norms[start: end],
centers_old,
Expand Down Expand Up @@ -162,11 +162,8 @@ def lloyd_iter_chunked_dense(


cdef void _update_chunk_dense(
floating *X, # IN
# expecting C alinged 2D array. XXX: Can be
# replaced by const memoryview when cython min
# version is >= 0.3
floating[::1] sample_weight, # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
floating[::1] centers_squared_norms, # IN
Expand Down Expand Up @@ -199,7 +196,7 @@ cdef void _update_chunk_dense(

# pairwise_distances += -2 * X.dot(C.T)
_gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features,
-2.0, X, n_features, &centers_old[0, 0], n_features,
-2.0, &X[0, 0], n_features, &centers_old[0, 0], n_features,
1.0, pairwise_distances, n_clusters)

for i in range(n_samples):
Expand All @@ -215,7 +212,7 @@ cdef void _update_chunk_dense(
if update_centers:
weight_in_clusters[label] += sample_weight[i]
for k in range(n_features):
centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i]
centers_new[label * n_features + k] += X[i, k] * sample_weight[i]


def lloyd_iter_chunked_sparse(
Expand Down
20 changes: 10 additions & 10 deletions sklearn/cluster/_k_means_minibatch.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ np.import_array()


def _minibatch_update_dense(
np.ndarray[floating, ndim=2, mode="c"] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_sums, # INOUT
int[::1] labels, # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_sums, # INOUT
int[::1] labels, # IN
int n_threads):
"""Update of the centers for dense MiniBatchKMeans.

Expand Down Expand Up @@ -59,7 +59,7 @@ def _minibatch_update_dense(
indices = <int*> malloc(n_samples * sizeof(int))

for cluster_idx in prange(n_clusters, schedule="static"):
update_center_dense(cluster_idx, &X[0, 0], sample_weight,
update_center_dense(cluster_idx, X, sample_weight,
centers_old, centers_new, weight_sums, labels,
indices)

Expand All @@ -68,8 +68,8 @@ def _minibatch_update_dense(

cdef void update_center_dense(
int cluster_idx,
floating *X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] X, # IN READ-ONLY
floating[::1] sample_weight, # IN READ-ONLY
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_sums, # INOUT
Expand Down Expand Up @@ -103,7 +103,7 @@ cdef void update_center_dense(
for k in range(n_indices):
sample_idx = indices[k]
for feature_idx in range(n_features):
centers_new[cluster_idx, feature_idx] += X[sample_idx * n_features + feature_idx] * sample_weight[sample_idx]
centers_new[cluster_idx, feature_idx] += X[sample_idx, feature_idx] * sample_weight[sample_idx]

# Update the count statistics for this center
weight_sums[cluster_idx] += wsum
Expand Down
10 changes: 9 additions & 1 deletion sklearn/cluster/_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from ..utils import deprecated
from ..utils.validation import check_is_fitted, _check_sample_weight
from ..utils._openmp_helpers import _openmp_effective_n_threads
from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
from ..exceptions import ConvergenceWarning
from ._k_means_common import CHUNK_SIZE
from ._k_means_common import _inertia_dense
Expand Down Expand Up @@ -729,6 +730,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1):
else:
_labels = lloyd_iter_chunked_dense
_inertia = _inertia_dense
X = ReadonlyArrayWrapper(X)

_labels(
X,
Expand Down Expand Up @@ -1449,7 +1451,13 @@ def _mini_batch_step(
)
else:
_minibatch_update_dense(
X, sample_weight, centers, centers_new, weight_sums, labels, n_threads
ReadonlyArrayWrapper(X),
sample_weight,
centers,
centers_new,
weight_sums,
labels,
n_threads,
)

# Reassign clusters that have very low weight
Expand Down
68 changes: 68 additions & 0 deletions sklearn/utils/_readonly_array_wrapper.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
ReadonlyArrayWrapper implements the buffer protocol to make the wrapped buffer behave
as if writeable, even for readonly buffers. This way, even readonly arrays can be
passed as argument of type (non const) memoryview.
This is a workaround for the missing support for const fused-typed memoryviews in
Cython < 3.0.

Note: All it does is LIE about the readonly attribute: it reports it as False!
This way, we can use it on arrays that we don't touch.
!!! USE CAREFULLY !!!
"""
# TODO: Remove with Cython >= 3.0 which supports const memoryviews for fused types.

from cpython cimport Py_buffer
from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_WRITABLE

import numpy as np
cimport numpy as np


np.import_array()


ctypedef fused NUM_TYPES:
np.npy_float64
np.npy_float32
np.npy_int64
np.npy_int32


cdef class ReadonlyArrayWrapper:
    """Expose a wrapped buffer as writable even when it is readonly.

    The wrapper forwards the buffer request to the wrapped object but, when a
    writable buffer was asked for, strips that requirement and then clears the
    ``readonly`` flag on the returned buffer. The consumer must therefore
    never actually write through the resulting view.
    """
    # The wrapped object; must support the buffer protocol
    # (e.g. a NumPy array).
    cdef object wraps

    def __init__(self, wraps):
        self.wraps = wraps

    def __getbuffer__(self, Py_buffer *buffer, int flags):
        request_for_writeable = False
        if flags & PyBUF_WRITABLE:
            # Remove PyBUF_WRITABLE before forwarding: a readonly underlying
            # buffer would otherwise reject the request with an error.
            flags ^= PyBUF_WRITABLE
            request_for_writeable = True
        PyObject_GetBuffer(self.wraps, buffer, flags)
        if request_for_writeable:
            # The following is a lie when self.wraps is readonly!
            # It makes the buffer acceptable for non-const memoryviews.
            buffer.readonly = False

    def __releasebuffer__(self, Py_buffer *buffer):
        # Forward the release to the underlying buffer's owner.
        PyBuffer_Release(buffer)


def _test_sum(NUM_TYPES[:] x):
    """Return the sum of the elements of x. This function is for testing only.

    As this function does not modify x, we would like to define it as

        _test_sum(const NUM_TYPES[:] x)

    which is not possible as fused typed const memoryviews aren't
    supported in Cython<3.0.
    """
    cdef:
        int i
        int n = x.shape[0]
        # Named `total` rather than `sum` to avoid shadowing the builtin.
        NUM_TYPES total = 0

    for i in range(n):
        total += x[i]
    return total
6 changes: 6 additions & 0 deletions sklearn/utils/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ def configuration(parent_package="", top_path=None):
libraries=libraries,
)

config.add_extension(
"_readonly_array_wrapper",
sources=["_readonly_array_wrapper.pyx"],
libraries=libraries,
)

config.add_subpackage("tests")

return config
Expand Down
Loading