
Commit 0bf05ba

adding adaptive learning rate for minibatch k-means
1 parent 1527b1f · commit 0bf05ba

4 files changed (+430, -52 lines)
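For quick context before the diffs: the benchmark below compares MiniBatchKMeans with the new adaptive learning rate against the existing update and against full-batch KMeans. The following is a minimal usage sketch, not part of the commit; the adaptive_lr keyword is taken from its use in benchmarks/bench_minibatch.py below, while make_blobs and the parameter values are illustrative assumptions. The estimator-side implementation lives in the files of this commit that are not shown in this excerpt.

# Illustrative sketch only: exercise the new adaptive_lr flag the same
# way the benchmark script does, on a small synthetic problem.
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=10_000, centers=10, random_state=0)

mbk_adaptive = MiniBatchKMeans(
    n_clusters=10, batch_size=1024, adaptive_lr=True, random_state=0
).fit(X)
mbk_default = MiniBatchKMeans(
    n_clusters=10, batch_size=1024, adaptive_lr=False, random_state=0
).fit(X)

print("adaptive lr inertia:", mbk_adaptive.inertia_)
print("default lr inertia: ", mbk_default.inertia_)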

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -64,6 +64,7 @@ reuters/
 benchmarks/bench_covertype_data/
 benchmarks/HIGGS.csv.gz
 bench_pca_solvers.csv
+benchmarks/minibatch_results/

 *.prefs
 .pydevproject

benchmarks/bench_minibatch.py

Lines changed: 278 additions & 0 deletions

@@ -0,0 +1,278 @@
import os
import time
from collections import defaultdict
from datetime import datetime
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.datasets import fetch_20newsgroups, fetch_openml
from sklearn.decomposition._truncated_svd import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler


def load_and_preprocess_data(dataset, n_sample=None):
    if dataset == "20newsgroups":
        newsgroups = fetch_20newsgroups(subset="train")
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(newsgroups.data)
        y = newsgroups.target
        n_components = 100
        # Apply Truncated SVD to the sparse matrix X
        svd = TruncatedSVD(n_components=n_components)
        X = svd.fit_transform(X)
        return X, y
    else:
        try:
            X, y = fetch_openml(
                name=dataset,
                version=1,
                as_frame=False,
                return_X_y=True,
                data_home=None,
                cache=True,
                parser="auto",
            )
        except Exception as e:
            raise Exception(f"Could not load dataset {dataset}. Error: {e}")
        normalize = False
        if dataset == "pendigits":
            normalize = True
        if dataset == "letter":
            normalize = True
        if n_sample is not None:
            shuffle = np.random.permutation(X.shape[0])
            X = X[shuffle]
            y = y[shuffle]
            X = X[: min(X.shape[0], n_sample)]
            y = y[: min(X.shape[0], n_sample)]
        if normalize:
            X = StandardScaler().fit_transform(X)
        return X, y


# MARK: -Evaluation
def evaluate(kms, X, labels, num_iters, n_clusters, batch_size, n_runs=50):
    # Fit every estimator n_runs times with different seeds and collect
    # NMI/ARI scores and training-time statistics.
    evaluations = []

    for name, km in kms.items():
        train_times = []
        print(f"Evaluating {name}")
        scores = defaultdict(list)
        for seed in range(n_runs):
            km.random_state = seed
            t0 = time.time()
            km.fit(X)
            # wall-clock time of the full fit
            train_times.append(time.time() - t0)
            scores["NMI"].append(
                metrics.normalized_mutual_info_score(labels, km.labels_)
            )
            scores["ARI"].append(metrics.adjusted_rand_score(labels, km.labels_))
        train_times = np.asarray(train_times)

        evaluation = {
            "estimator": name,
            "num_iters": num_iters,
            "n_clusters": n_clusters,
            "batch_size": batch_size,
            "train_time_mean": train_times.mean(),
            "train_time_std": train_times.std(),
        }
        for score_name, score_values in scores.items():
            mean_score, std_score = np.mean(score_values), np.std(score_values)
            evaluation[score_name + "_mean"] = mean_score
            evaluation[score_name + "_std"] = std_score

        evaluations.append(evaluation)

        print(
            f"\n {name}, num_iters: {num_iters}, n_clusters: {n_clusters},\
            batch size: {batch_size}"
        )
        for score_name, score_values in scores.items():
            mean_score, std_score = np.mean(score_values), np.std(score_values)
            print(f"{score_name}: {mean_score:.3f} ± {std_score:.3f}")
    return evaluations


colors = [
    "#1f77b4",  # muted blue
    "#ff7f0e",  # safety orange
    "#ff7f0e",  # safety orange
    "#9467bd",  # muted purple
    "#9467bd",  # muted purple
    "#8c564b",  # chestnut brown
    "#e377c2",  # raspberry yogurt pink
    "#7f7f7f",  # middle gray
    "#bcbd22",  # curry yellow-green
    "#17becf",  # blue-teal
]
hatches = ["", "//", "", "//", ""]


def plot_results(to_plot):
    # One column per dataset, one row per batch size; clustering scores on
    # the left axis and runtime on a twin axis.
    plt.rcParams.update({"font.size": 24})
    plt.figure(figsize=(10, 6))
    num_res = len(to_plot)  # Number of columns in the grid
    # assume all DFs have the same batch sizes
    batch_sizes = to_plot[0]["batch_size"].unique()
    num_batches = len(batch_sizes)
    print(num_batches, num_res)
    fig, axes = plt.subplots(
        num_batches, num_res, figsize=(7 * num_res, 6 * num_batches)
    )
    for j in range(num_batches):
        for i, df1 in enumerate(to_plot):
            b = batch_sizes[j]
            name = df1["dataset"].iloc[0]
            df = df1[df1["batch_size"] == batch_sizes[j]]

            if num_batches == 1 and num_res == 1:
                ax = axes
            elif num_batches == 1:
                ax = axes[i]
            else:
                ax = axes[j][i]
            ax1, ax2 = plot_results_bars(df, ax, i == 0 and j == 0)
            if i == 0:
                ax1.set_ylabel("Score")
            if i == num_res - 1:
                ax2.set_ylabel("Time (s)")
            ax.set_title(f"{name} (batch size: {b})")

    fig.legend(loc="lower center", bbox_to_anchor=(0.5, 1.04), ncol=5, fontsize=34)

    plt.tight_layout()
    # write to results directory
    plt.savefig("minibatch_results/results.png", bbox_inches="tight")


def plot_results_bars(df, ax1, set_labels=True):
    # Grouped bars: one group per metric (plus runtime), one bar per estimator.
    metric_names = ["ARI", "NMI"]
    time_metric = "train_time"
    sorted(df["estimator"].unique())
    ax2 = ax1.twinx()
    n_metrics = len(metric_names) + 1  # Including train_time
    bar_width = 0.4
    positions = np.arange(n_metrics) * (len(df["estimator"].unique()) * bar_width + 0.5)
    df_comb = df

    for i, metric in enumerate(metric_names + [time_metric]):
        metric_mean = metric + "_mean"
        metric_std = metric + "_std"
        for j, name in enumerate(sorted(df["estimator"].unique())):
            position = positions[i] + j * bar_width - 0.5
            ax = ax1
            if metric == time_metric:
                ax = ax2
            alg_name = name[2:]  # strip the "1."/"2."/"3." ordering prefix
            ax.bar(
                position,
                df_comb[df_comb["estimator"] == name][metric_mean].iloc[0],
                bar_width,
                color=colors[j],
                label=(alg_name) if i == 0 and set_labels else "",
                yerr=df_comb[df_comb["estimator"] == name][metric_std].iloc[0],
                capsize=5,
                hatch=hatches[j],
                edgecolor="black",
                linewidth=1,
            )
    ax1.set_xticks(positions + bar_width / 2)
    ax1.set_xticklabels(metric_names + ["runtime"])
    return ax1, ax2


result_files = []


# Benchmark configuration
n_runs = 10
n_iters = [100]
batch_size_values = [1024]
tol = 1e-4

to_plot = []

dataset_names = [
    "pendigits",
    "har",
    "mnist_784",
    "letter",
    "20newsgroups",
]
print("Running on datasets:", dataset_names)
for dataset_name in dataset_names:
    n = None
    X, Y = load_and_preprocess_data(dataset_name, n_sample=n)
    if n is None:
        n = X.shape[0]

    num_clusters = np.unique(Y).shape[0]
    print(f"num clusters: {num_clusters}")
    n_clusters = np.unique(Y).shape[0]
    n_clusters_values = [n_clusters]
    evaluations = []
    current_datetime = datetime.now()
    print(f"dataset: {dataset_name}")
    evaluations = []

    for num_iters, n_clusters, batch_size in product(
        n_iters, n_clusters_values, batch_size_values
    ):
        print("#" * 20)
        mbk_newlr = MiniBatchKMeans(
            n_clusters=n_clusters,
            batch_size=batch_size,
            max_iter=num_iters,
            adaptive_lr=True,
            tol=tol,
        )
        mbk_oldlr = MiniBatchKMeans(
            n_clusters=n_clusters,
            batch_size=batch_size,
            max_iter=num_iters,
            adaptive_lr=False,
            tol=tol,
        )
        km_full = KMeans(n_clusters=n_clusters, max_iter=num_iters, tol=tol)
        mbks = {
            "1.new lr MiniBatch": mbk_newlr,
            "2.MiniBatch": mbk_oldlr,
            "3.KMeans": km_full,
        }

        evaluations += evaluate(mbks, X, Y, num_iters, n_clusters, batch_size, n_runs)

    # Convert evaluations to DataFrame
    df = pd.DataFrame(evaluations)
    metric_names = [
        "Homogeneity",
        "Completeness",
        "V-measure",
        "ARI",
        "Silhouette Coefficient",
        "NMI",
    ]
    param_vals = {
        "num_iters": n_iters,
        "n_clusters": n_clusters_values,
        "batch_size": batch_size_values,
        "n_runs": n_runs,
        "n": n,
    }

    if not os.path.exists("minibatch_results"):
        os.makedirs("minibatch_results")

    df["dataset"] = dataset_name
    to_plot.append(df)

plot_results(to_plot)
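A usage note on running the script: it writes its figure to minibatch_results/results.png relative to the working directory, so invoking python bench_minibatch.py from the benchmarks/ directory places the output under benchmarks/minibatch_results/, the path ignored by the .gitignore change above. This assumes the scikit-learn build from this branch is installed; a stock release would raise a TypeError for the unknown adaptive_lr keyword.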
