Use fused types in kmeans_elkan

yenchenlin · yenchenlin · commit abb752caa767 · 2016-05-31T23:10:22.000+08:00
diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx
@@ -10,6 +10,7 @@
 import numpy as np
 cimport numpy as np
 cimport cython
+from cython cimport floating
 
 from libc.math cimport sqrt
 
@@ -18,8 +19,8 @@ from ._k_means import _centers_dense
 from ..utils.fixes import partition
 
 
-cdef double euclidian_dist(double* a, double* b, int n_features) nogil:
-    cdef double result, tmp
+cdef floating euclidian_dist(floating* a, floating* b, int n_features) nogil:
+    cdef floating result, tmp
     result = 0
     cdef int i
     for i in range(n_features):
@@ -29,8 +30,8 @@ cdef double euclidian_dist(double* a, double* b, int n_features) nogil:
 
 
 cdef update_labels_distances_inplace(
-        double* X, double* centers, double[:, :] center_half_distances,
-        int[:] labels, double[:, :] lower_bounds, double[:] upper_bounds,
+        floating* X, floating* centers, floating[:, :] center_half_distances,
+        int[:] labels, floating[:, :] lower_bounds, floating[:] upper_bounds,
         int n_samples, int n_features, int n_clusters):
     """
     Calculate upper and lower bounds for each sample.
@@ -81,9 +82,9 @@ cdef update_labels_distances_inplace(
     """
     # assigns closest center to X
     # uses triangle inequality
-    cdef double* x
-    cdef double* c
-    cdef double d_c, dist
+    cdef floating* x
+    cdef floating* c
+    cdef floating d_c, dist
     cdef int c_x, j, sample
     for sample in range(n_samples):
         # assign first cluster center
@@ -103,8 +104,8 @@ cdef update_labels_distances_inplace(
         upper_bounds[sample] = d_c
 
 
-def k_means_elkan(np.ndarray[np.float64_t, ndim=2, mode='c'] X_, int n_clusters,
-                  np.ndarray[np.float64_t, ndim=2, mode='c'] init,
+def k_means_elkan(np.ndarray[floating, ndim=2, mode='c'] X_, int n_clusters,
+                  np.ndarray[floating, ndim=2, mode='c'] init,
                   float tol=1e-4, int max_iter=30, verbose=False):
     """Run Elkan's k-means.
 
@@ -128,30 +129,35 @@ def k_means_elkan(np.ndarray[np.float64_t, ndim=2, mode='c'] X_, int n_clusters,
         Whether to be verbose.
 
     """
-    #initialize
-    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] centers_ = init
-    cdef double* centers_p = <double*>centers_.data
-    cdef double* X_p = <double*>X_.data
-    cdef double* x_p
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    # initialize
+    cdef np.ndarray[floating, ndim=2, mode='c'] centers_ = init
+    cdef floating* centers_p = <floating*>centers_.data
+    cdef floating* X_p = <floating*>X_.data
+    cdef floating* x_p
     cdef Py_ssize_t n_samples = X_.shape[0]
     cdef Py_ssize_t n_features = X_.shape[1]
     cdef int point_index, center_index, label
-    cdef float upper_bound, distance
-    cdef double[:, :] center_half_distances = euclidean_distances(centers_) / 2.
-    cdef double[:, :] lower_bounds = np.zeros((n_samples, n_clusters))
-    cdef double[:] distance_next_center
+    cdef floating upper_bound, distance
+    cdef floating[:, :] center_half_distances = euclidean_distances(centers_) / 2.
+    cdef floating[:, :] lower_bounds = np.zeros((n_samples, n_clusters), dtype=dtype)
+    cdef floating[:] distance_next_center
     labels_ = np.empty(n_samples, dtype=np.int32)
     cdef int[:] labels = labels_
-    upper_bounds_ = np.empty(n_samples, dtype=np.float)
-    cdef double[:] upper_bounds = upper_bounds_
+    upper_bounds_ = np.empty(n_samples, dtype=dtype)
+    cdef floating[:] upper_bounds = upper_bounds_
 
     # Get the inital set of upper bounds and lower bounds for each sample.
     update_labels_distances_inplace(X_p, centers_p, center_half_distances,
                                     labels, lower_bounds, upper_bounds,
                                     n_samples, n_features, n_clusters)
     cdef np.uint8_t[:] bounds_tight = np.ones(n_samples, dtype=np.uint8)
     cdef np.uint8_t[:] points_to_update = np.zeros(n_samples, dtype=np.uint8)
-    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] new_centers
+    cdef np.ndarray[floating, ndim=2, mode='c'] new_centers
 
     if max_iter <= 0:
         raise ValueError('Number of iterations should be a positive number'
@@ -226,7 +232,7 @@ def k_means_elkan(np.ndarray[np.float64_t, ndim=2, mode='c'] X_, int n_clusters,
 
         # reassign centers
         centers_ = new_centers
-        centers_p = <double*>new_centers.data
+        centers_p = <floating*>new_centers.data
 
         # update between-center distances
         center_half_distances = euclidean_distances(centers_) / 2.