Skip to content

Commit d2d2ae9

Browse files
committed
Added draft version of k-means||
1 parent 861ac13 commit d2d2ae9

File tree

1 file changed

+102
-11
lines changed

1 file changed

+102
-11
lines changed

sklearn/cluster/k_means_.py

Lines changed: 102 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,85 @@
3838

3939
###############################################################################
4040
# Initialization heuristic
41+
def _k_para_init(X, n_clusters, x_squared_norms, random_state, l=None, r=5):
42+
"""Init n_clusters seeds according to k-means||
43+
44+
Parameters
45+
-----------
46+
X: array or sparse matrix, shape (n_samples, n_features)
47+
The data to pick seeds for. To avoid memory copy, the input data
48+
should be double precision (dtype=np.float64).
49+
50+
n_clusters: integer
51+
The number of seeds to choose
52+
53+
l: int
54+
Number of centers to sample at each iteration of k-means||.
55+
56+
r: int
57+
Number of iterations of k-means|| to perfoem.
58+
59+
x_squared_norms: array, shape (n_samples,)
60+
Squared Euclidean norm of each data point.
61+
62+
random_state: numpy.RandomState
63+
The generator used to initialize the centers.
64+
65+
Notes
66+
-----
67+
Selects initial cluster centers for k-mean clustering in a smart way
68+
to speed up convergence. see: Bahmani, Bahman, et al. "Scalable k-means++."
69+
Proceedings of the VLDB Endowment 5.7 (2012): 622-633.
70+
71+
Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
72+
which is the implementation used in the aforementioned paper.
73+
"""
74+
n_samples, n_features = X.shape
75+
76+
assert x_squared_norms is not None, 'x_squared_norms None in _k_init'
77+
78+
# Pick first center randomly
79+
center_id = random_state.randint(n_samples)
80+
if sp.issparse(X):
81+
center = X[center_id].toarray()
82+
else:
83+
center = X[center_id, np.newaxis]
84+
85+
# Initialize list of closest distances and calculate current potential
86+
closest_dist_sq = euclidean_distances(
87+
center, X, Y_norm_squared=x_squared_norms,
88+
squared=True)
89+
current_pot = closest_dist_sq.sum()
90+
91+
center_ids = [center_id]
92+
93+
r = max(n_clusters / l, r)
94+
95+
if l * r < n_clusters:
96+
raise ValueError('l * r should be greater or equal to n_clusters (l={}'
97+
', r={}, n_clusters={}).'.format(l, r, n_clusters))
98+
99+
for _ in range(r):
100+
# Choose new centers by sampling with probability proportional
101+
# to the squared distance to the closest existing center
102+
rand_vals = random_state.random_sample(l) * current_pot
103+
candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)
104+
105+
# Compute distances to new centers
106+
distance_to_candidates = euclidean_distances(
107+
X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)
108+
109+
# Compute potential when including the new centers
110+
distances = np.min(distance_to_candidates, axis=0)
111+
closest_dist_sq = np.minimum(distances, closest_dist_sq)
112+
current_pot = closest_dist_sq.sum()
113+
114+
center_ids.extend(candidate_ids)
115+
116+
centers, _, _ = k_means(X[center_ids], n_clusters, init='k-means++',
117+
tol=1e-2)
118+
119+
return centers
41120

42121

43122
def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
@@ -166,7 +245,7 @@ def _tolerance(X, tol):
166245
def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
167246
n_init=10, max_iter=300, verbose=False,
168247
tol=1e-4, random_state=None, copy_x=True, n_jobs=1,
169-
return_n_iter=False):
248+
return_n_iter=False, l_para=None):
170249
"""K-means clustering algorithm.
171250
172251
Read more in the :ref:`User Guide <k_means>`.
@@ -244,6 +323,9 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
244323
245324
return_n_iter : bool, optional
246325
Whether or not to return the number of iterations.
326+
327+
l_para : int, optional, default: None
328+
Number of centers to sample per iteration when using k-means|| init.
247329
248330
Returns
249331
-------
@@ -321,7 +403,8 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
321403
labels, inertia, centers, n_iter_ = _kmeans_single(
322404
X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
323405
precompute_distances=precompute_distances, tol=tol,
324-
x_squared_norms=x_squared_norms, random_state=random_state)
406+
x_squared_norms=x_squared_norms, random_state=random_state,
407+
l_para=l_para)
325408
# determine if these results are the best so far
326409
if best_inertia is None or inertia < best_inertia:
327410
best_labels = labels.copy()
@@ -337,7 +420,7 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
337420
precompute_distances=precompute_distances,
338421
x_squared_norms=x_squared_norms,
339422
# Change seed to ensure variety
340-
random_state=seed)
423+
random_state=seed, l_para=l_para)
341424
for seed in seeds)
342425
# Get results with the lowest inertia
343426
labels, inertia, centers, n_iters = zip(*results)
@@ -360,7 +443,7 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
360443

361444
def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
362445
init='k-means++', verbose=False, random_state=None,
363-
tol=1e-4, precompute_distances=True):
446+
tol=1e-4, precompute_distances=True, l_para=None):
364447
"""A single run of k-means, assumes preparation completed prior.
365448
366449
Parameters
@@ -429,7 +512,7 @@ def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
429512
best_labels, best_inertia, best_centers = None, None, None
430513
# init
431514
centers = _init_centroids(X, n_clusters, init, random_state=random_state,
432-
x_squared_norms=x_squared_norms)
515+
x_squared_norms=x_squared_norms, l_para=l_para)
433516
if verbose:
434517
print("Initialization complete")
435518

@@ -580,7 +663,7 @@ def _labels_inertia(X, x_squared_norms, centers,
580663

581664

582665
def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
583-
init_size=None):
666+
init_size=None, l_para=None):
584667
"""Compute the initial centroids
585668
586669
Parameters
@@ -638,6 +721,9 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
638721
if isinstance(init, string_types) and init == 'k-means++':
639722
centers = _k_init(X, k, random_state=random_state,
640723
x_squared_norms=x_squared_norms)
724+
elif isinstance(init, string_types) and init == 'k-means||':
725+
centers = _k_para_init(X, k, random_state=random_state,
726+
x_squared_norms=x_squared_norms, l=l_para)
641727
elif isinstance(init, string_types) and init == 'random':
642728
seeds = random_state.permutation(n_samples)[:k]
643729
centers = X[seeds]
@@ -647,7 +733,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
647733
centers = init(X, k, random_state=random_state)
648734
else:
649735
raise ValueError("the init parameter for the k-means should "
650-
"be 'k-means++' or 'random' or an ndarray, "
736+
"be 'k-means++', 'k-means||', 'random' or an ndarray, "
651737
"'%s' (type '%s') was passed." % (init, type(init)))
652738

653739
if sp.issparse(centers):
@@ -678,7 +764,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
678764
centroid seeds. The final results will be the best output of
679765
n_init consecutive runs in terms of inertia.
680766
681-
init : {'k-means++', 'random' or an ndarray}
767+
init : {'k-means++', 'random', 'k-means||' or an ndarray}
682768
Method for initialization, defaults to 'k-means++':
683769
684770
'k-means++' : selects initial cluster centers for k-mean
@@ -705,6 +791,10 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
705791
tol : float, default: 1e-4
706792
Relative tolerance with regards to inertia to declare convergence
707793
794+
l_para: int, default None
795+
Number of centers to sample per iteration when using k-mean||
796+
initialization.
797+
708798
n_jobs : int
709799
The number of jobs to use for the computation. This works by computing
710800
each of the n_init runs in parallel.
@@ -767,7 +857,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
767857
"""
768858

769859
def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
770-
tol=1e-4, precompute_distances='auto',
860+
tol=1e-4, precompute_distances='auto', l_para=None,
771861
verbose=0, random_state=None, copy_x=True, n_jobs=1):
772862

773863
self.n_clusters = n_clusters
@@ -780,6 +870,7 @@ def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
780870
self.random_state = random_state
781871
self.copy_x = copy_x
782872
self.n_jobs = n_jobs
873+
self.l_para = l_para
783874

784875
def _check_fit_data(self, X):
785876
"""Verify that the number of samples given is larger than k"""
@@ -818,7 +909,7 @@ def fit(self, X, y=None):
818909
verbose=self.verbose, return_n_iter=True,
819910
precompute_distances=self.precompute_distances,
820911
tol=self.tol, random_state=random_state, copy_x=self.copy_x,
821-
n_jobs=self.n_jobs)
912+
n_jobs=self.n_jobs, l_para=self.l_para)
822913
return self
823914

824915
def fit_predict(self, X, y=None):
@@ -1151,7 +1242,7 @@ class MiniBatchKMeans(KMeans):
11511242
only algorithm is initialized by running a batch KMeans on a
11521243
random subset of the data. This needs to be larger than n_clusters.
11531244
1154-
init : {'k-means++', 'random' or an ndarray}, default: 'k-means++'
1245+
init : {'k-means++', 'random', 'k-means||' or an ndarray}, default: 'k-means++'
11551246
Method for initialization, defaults to 'k-means++':
11561247
11571248
'k-means++' : selects initial cluster centers for k-mean

0 commit comments

Comments
 (0)