###############################################################################
# Initialization heuristic
+ def _k_para_init(X, n_clusters, x_squared_norms, random_state, l=None, r=5):
+     """Init n_clusters seeds according to k-means||.
+
+     Parameters
+     ----------
+     X : array or sparse matrix, shape (n_samples, n_features)
+         The data to pick seeds for. To avoid memory copy, the input data
+         should be double precision (dtype=np.float64).
+
+     n_clusters : integer
+         The number of seeds to choose.
+
+     x_squared_norms : array, shape (n_samples,)
+         Squared Euclidean norm of each data point.
+
+     random_state : numpy.RandomState
+         The generator used to initialize the centers.
+
+     l : int, optional, default: None
+         Number of centers to sample at each iteration of k-means||.
+         If None, 2 * n_clusters is used.
+
+     r : int, optional, default: 5
+         Number of iterations of k-means|| to perform.
+
+     Notes
+     -----
+     Selects initial cluster centers for k-means clustering in a smart way
+     to speed up convergence. See: Bahmani, Bahman, et al. "Scalable
+     k-means++." Proceedings of the VLDB Endowment 5.7 (2012): 622-633.
+     """
+     n_samples, n_features = X.shape
+
+     assert x_squared_norms is not None, 'x_squared_norms None in _k_para_init'
+
+     if l is None:
+         # Oversampling factor; the reference paper reports good results
+         # with l = O(n_clusters).
+         l = 2 * n_clusters
+
+     # Pick first center randomly
+     center_id = random_state.randint(n_samples)
+     if sp.issparse(X):
+         center = X[center_id].toarray()
+     else:
+         center = X[center_id, np.newaxis]
+
+     # Initialize list of closest distances and calculate current potential
+     closest_dist_sq = euclidean_distances(
+         center, X, Y_norm_squared=x_squared_norms,
+         squared=True)
+     current_pot = closest_dist_sq.sum()
+
+     center_ids = [center_id]
+
+     # Run enough rounds to draw at least n_clusters candidates (ceiling
+     # division avoids undershooting when l does not divide n_clusters)
+     r = max(-(-n_clusters // l), r)
+
+     if l * r < n_clusters:
+         raise ValueError('l * r should be greater than or equal to '
+                          'n_clusters (l={}, r={}, n_clusters={}).'
+                          .format(l, r, n_clusters))
+
+     for _ in range(r):
+         # Choose new centers by sampling with probability proportional
+         # to the squared distance to the closest existing center
+         # (see the standalone sketch after this function)
+         rand_vals = random_state.random_sample(l) * current_pot
+         candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)
+
+         # Compute distances to new centers
+         distance_to_candidates = euclidean_distances(
+             X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)
+
+         # Compute potential when including the new centers
+         distances = np.min(distance_to_candidates, axis=0)
+         closest_dist_sq = np.minimum(distances, closest_dist_sq)
+         current_pot = closest_dist_sq.sum()
+
+         center_ids.extend(candidate_ids)
+
+     # Recluster the oversampled candidates down to n_clusters seeds
+     centers, _, _ = k_means(X[center_ids], n_clusters, init='k-means++',
+                             tol=1e-2, random_state=random_state)
+
+     return centers
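
The sampling step above inverts the cumulative sum of the squared distances:
drawing uniform values in [0, current_pot) and locating them with
np.searchsorted picks each point with probability proportional to its
contribution to the potential. A minimal standalone sketch of the trick
(illustrative values, not part of the patch):

    import numpy as np

    rng = np.random.RandomState(0)
    weights = np.array([0.1, 2.0, 0.4, 1.5])        # e.g. squared distances
    draws = rng.random_sample(3) * weights.sum()    # 3 uniform picks in [0, total)
    idx = np.searchsorted(weights.cumsum(), draws)  # P(idx == i) == weights[i] / weights.sum()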


def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
@@ -166,7 +245,7 @@ def _tolerance(X, tol):
def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
            n_init=10, max_iter=300, verbose=False,
            tol=1e-4, random_state=None, copy_x=True, n_jobs=1,
-           return_n_iter=False):
+           return_n_iter=False, l_para=None):
    """K-means clustering algorithm.

    Read more in the :ref:`User Guide <k_means>`.
@@ -244,6 +323,9 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
    return_n_iter : bool, optional
        Whether or not to return the number of iterations.
+
+    l_para : int, optional, default: None
+        Number of centers to sample per iteration when using k-means|| init.

    Returns
    -------
@@ -321,7 +403,8 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
            labels, inertia, centers, n_iter_ = _kmeans_single(
                X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
                precompute_distances=precompute_distances, tol=tol,
-               x_squared_norms=x_squared_norms, random_state=random_state)
+               x_squared_norms=x_squared_norms, random_state=random_state,
+               l_para=l_para)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
@@ -337,7 +420,7 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
                precompute_distances=precompute_distances,
                x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
-               random_state=seed)
+               random_state=seed, l_para=l_para)
            for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
@@ -360,7 +443,7 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',

def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
                   init='k-means++', verbose=False, random_state=None,
-                  tol=1e-4, precompute_distances=True):
+                  tol=1e-4, precompute_distances=True, l_para=None):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
@@ -429,7 +512,7 @@ def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
-                             x_squared_norms=x_squared_norms)
+                             x_squared_norms=x_squared_norms, l_para=l_para)
    if verbose:
        print("Initialization complete")
@@ -580,7 +663,7 @@ def _labels_inertia(X, x_squared_norms, centers,

def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
-                   init_size=None):
+                   init_size=None, l_para=None):
    """Compute the initial centroids

    Parameters
@@ -638,6 +721,9 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
    if isinstance(init, string_types) and init == 'k-means++':
        centers = _k_init(X, k, random_state=random_state,
                          x_squared_norms=x_squared_norms)
+    elif isinstance(init, string_types) and init == 'k-means||':
+        centers = _k_para_init(X, k, random_state=random_state,
+                               x_squared_norms=x_squared_norms, l=l_para)
    elif isinstance(init, string_types) and init == 'random':
        seeds = random_state.permutation(n_samples)[:k]
        centers = X[seeds]
@@ -647,7 +733,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
        centers = init(X, k, random_state=random_state)
    else:
        raise ValueError("the init parameter for the k-means should "
-                        "be 'k-means++' or 'random' or an ndarray, "
+                        "be 'k-means++', 'k-means||', 'random' or an ndarray, "
                         "'%s' (type '%s') was passed." % (init, type(init)))

    if sp.issparse(centers):
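
With the new branch in place, the dispatch can be exercised directly. A quick
sketch, assuming a dense float64 array X is in scope (these are private
helpers, so this is for illustration only; l_para=20 is an arbitrary choice):

    import numpy as np

    rng = np.random.RandomState(0)
    x_sq = (X ** 2).sum(axis=1)
    centers = _init_centroids(X, 10, 'k-means||', random_state=rng,
                              x_squared_norms=x_sq, l_para=20)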
@@ -678,7 +764,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

-    init : {'k-means++', 'random' or an ndarray}
+    init : {'k-means++', 'random', 'k-means||' or an ndarray}
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
@@ -705,6 +791,10 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    tol : float, default: 1e-4
        Relative tolerance with regards to inertia to declare convergence

+    l_para : int, default: None
+        Number of centers to sample per iteration when using k-means||
+        initialization.
+
    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.
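
At the estimator level the new option would be exercised as below; a sketch
assuming this patched branch ('k-means||' and l_para are not valid in released
scikit-learn, and the values here are illustrative):

    import numpy as np
    from sklearn.cluster import KMeans

    X = np.random.RandomState(42).rand(5000, 8)
    km = KMeans(n_clusters=50, init='k-means||', l_para=100, n_init=1,
                random_state=0).fit(X)
    print(km.inertia_)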
@@ -767,7 +857,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """

    def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
-                tol=1e-4, precompute_distances='auto',
+                tol=1e-4, precompute_distances='auto', l_para=None,
                 verbose=0, random_state=None, copy_x=True, n_jobs=1):

        self.n_clusters = n_clusters
@@ -780,6 +870,7 @@ def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
        self.random_state = random_state
        self.copy_x = copy_x
        self.n_jobs = n_jobs
+        self.l_para = l_para

    def _check_fit_data(self, X):
        """Verify that the number of samples given is larger than k"""
@@ -818,7 +909,7 @@ def fit(self, X, y=None):
            verbose=self.verbose, return_n_iter=True,
            precompute_distances=self.precompute_distances,
            tol=self.tol, random_state=random_state, copy_x=self.copy_x,
-           n_jobs=self.n_jobs)
+           n_jobs=self.n_jobs, l_para=self.l_para)
        return self

    def fit_predict(self, X, y=None):
@@ -1151,7 +1242,7 @@ class MiniBatchKMeans(KMeans):
        only algorithm is initialized by running a batch KMeans on a
        random subset of the data. This needs to be larger than n_clusters.

-    init : {'k-means++', 'random' or an ndarray}, default: 'k-means++'
+    init : {'k-means++', 'random', 'k-means||' or an ndarray}, default: 'k-means++'
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means