import os
import time
from collections import defaultdict
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.datasets import fetch_20newsgroups, fetch_openml
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler


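# Load a dataset by name and return (X, y). "20newsgroups" is vectorized with
# TF-IDF and reduced with TruncatedSVD; all other names are fetched from
# OpenML, optionally subsampled and, for some datasets, standardized.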
def load_and_preprocess_data(dataset, n_sample=None):
    if dataset == "20newsgroups":
        newsgroups = fetch_20newsgroups(subset="train")
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(newsgroups.data)
        y = newsgroups.target
        # Reduce the sparse TF-IDF matrix to a dense low-rank representation.
        n_components = 100
        svd = TruncatedSVD(n_components=n_components)
        X = svd.fit_transform(X)
        return X, y
    else:
        try:
            X, y = fetch_openml(
                name=dataset,
                version=1,
                as_frame=False,
                return_X_y=True,
                data_home=None,
                cache=True,
                parser="auto",
            )
        except Exception as e:
            raise RuntimeError(f"Could not load dataset {dataset}.") from e
        # Only these datasets benefit from feature standardization.
        normalize = dataset in ("pendigits", "letter")
        if n_sample is not None:
            # Shuffle before subsampling so the sample is unbiased.
            shuffle = np.random.permutation(X.shape[0])
            X = X[shuffle]
            y = y[shuffle]
            n_keep = min(X.shape[0], n_sample)
            X = X[:n_keep]
            y = y[:n_keep]
        if normalize:
            X = StandardScaler().fit_transform(X)
        return X, y


# Evaluation: fit each estimator n_runs times with different seeds and
# aggregate clustering scores and training times.
def evaluate(kms, X, labels, num_iters, n_clusters, batch_size, n_runs=50):
    evaluations = []

    for name, km in kms.items():
        train_times = []
        print(f"Evaluating {name}")
        scores = defaultdict(list)
        for seed in range(n_runs):
            km.random_state = seed
            t0 = time.time()
            km.fit(X)
            # Wall-clock training time for this run.
            train_times.append(time.time() - t0)
            scores["NMI"].append(
                metrics.normalized_mutual_info_score(labels, km.labels_)
            )
            scores["ARI"].append(metrics.adjusted_rand_score(labels, km.labels_))
        train_times = np.asarray(train_times)

        evaluation = {
            "estimator": name,
            "num_iters": num_iters,
            "n_clusters": n_clusters,
            "batch_size": batch_size,
            "train_time_mean": train_times.mean(),
            "train_time_std": train_times.std(),
        }
        for score_name, score_values in scores.items():
            mean_score, std_score = np.mean(score_values), np.std(score_values)
            evaluation[score_name + "_mean"] = mean_score
            evaluation[score_name + "_std"] = std_score

        evaluations.append(evaluation)

        print(
            f"\n{name}, num_iters: {num_iters}, n_clusters: {n_clusters}, "
            f"batch size: {batch_size}"
        )
        for score_name, score_values in scores.items():
            mean_score, std_score = np.mean(score_values), np.std(score_values)
            print(f"{score_name}: {mean_score:.3f} ± {std_score:.3f}")
    return evaluations


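# Bar colors and hatch patterns used to tell the estimators apart in the plots.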
colors = [
    "#1f77b4",  # muted blue
    "#ff7f0e",  # safety orange
    "#ff7f0e",  # safety orange
    "#9467bd",  # muted purple
    "#9467bd",  # muted purple
    "#8c564b",  # chestnut brown
    "#e377c2",  # raspberry yogurt pink
    "#7f7f7f",  # middle gray
    "#bcbd22",  # curry yellow-green
    "#17becf",  # blue-teal
]
hatches = ["", "//", "", "//", ""]


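# Draw a grid of bar charts: one column per dataset, one row per batch size,
# with clustering scores on the left axis and training time on the right axis.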
def plot_results(to_plot):
    plt.rcParams.update({"font.size": 24})
    num_res = len(to_plot)  # number of columns (one per dataset)
    # assume all DFs have the same batch sizes
    batch_sizes = to_plot[0]["batch_size"].unique()
    num_batches = len(batch_sizes)  # number of rows (one per batch size)
    fig, axes = plt.subplots(
        num_batches,
        num_res,
        figsize=(7 * num_res, 6 * num_batches),
        squeeze=False,
    )
    for j in range(num_batches):
        for i, df1 in enumerate(to_plot):
            b = batch_sizes[j]
            name = df1["dataset"].iloc[0]
            df = df1[df1["batch_size"] == b]

            ax = axes[j, i]
            ax1, ax2 = plot_results_bars(df, ax, i == 0 and j == 0)
            if i == 0:
                ax1.set_ylabel("Score")
            if i == num_res - 1:
                ax2.set_ylabel("Time (s)")
            ax.set_title(f"{name} (batch size: {b})")

    fig.legend(loc="lower center", bbox_to_anchor=(0.5, 1.04), ncol=5, fontsize=34)

    plt.tight_layout()
    # write to the results directory
    plt.savefig("minibatch_results/results.png", bbox_inches="tight")


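# Plot grouped bars (with error bars) for ARI, NMI and runtime on one subplot;
# scores use the primary y-axis and runtime the twin y-axis.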
def plot_results_bars(df, ax1, set_labels=True):
    metric_names = ["ARI", "NMI"]
    time_metric = "train_time"
    estimator_names = sorted(df["estimator"].unique())
    ax2 = ax1.twinx()  # secondary axis for the runtime bars
    n_metrics = len(metric_names) + 1  # including train_time
    bar_width = 0.4
    positions = np.arange(n_metrics) * (len(estimator_names) * bar_width + 0.5)

    for i, metric in enumerate(metric_names + [time_metric]):
        metric_mean = metric + "_mean"
        metric_std = metric + "_std"
        for j, name in enumerate(estimator_names):
            position = positions[i] + j * bar_width - 0.5
            # Scores go on the left axis, runtime on the right axis.
            ax = ax2 if metric == time_metric else ax1
            alg_name = name[2:]  # drop the "1." / "2." ordering prefix
            row = df[df["estimator"] == name]
            ax.bar(
                position,
                row[metric_mean].iloc[0],
                bar_width,
                color=colors[j],
                label=alg_name if i == 0 and set_labels else "",
                yerr=row[metric_std].iloc[0],
                capsize=5,
                hatch=hatches[j],
                edgecolor="black",
                linewidth=1,
            )
    ax1.set_xticks(positions + bar_width / 2)
    ax1.set_xticklabels(metric_names + ["runtime"])
    return ax1, ax2


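# Benchmark driver: for each dataset, compare MiniBatchKMeans with the
# adaptive learning rate against standard MiniBatchKMeans and full-batch
# KMeans, then plot the aggregated scores and runtimes.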
n_runs = 10
n_iters = [100]
batch_size_values = [1024]
tol = 1e-4

to_plot = []

dataset_names = [
    "pendigits",
    "har",
    "mnist_784",
    "letter",
    "20newsgroups",
]
print("Running on datasets:", dataset_names)
for dataset_name in dataset_names:
    n = None
    X, Y = load_and_preprocess_data(dataset_name, n_sample=n)
    if n is None:
        n = X.shape[0]

    # Use the number of ground-truth classes as the number of clusters.
    n_clusters = np.unique(Y).shape[0]
    print(f"dataset: {dataset_name}, num clusters: {n_clusters}")
    n_clusters_values = [n_clusters]
    evaluations = []

    for num_iters, n_clusters, batch_size in product(
        n_iters, n_clusters_values, batch_size_values
    ):
        print("#" * 20)
        mbk_newlr = MiniBatchKMeans(
            n_clusters=n_clusters,
            batch_size=batch_size,
            max_iter=num_iters,
            adaptive_lr=True,
            tol=tol,
        )
        mbk_oldlr = MiniBatchKMeans(
            n_clusters=n_clusters,
            batch_size=batch_size,
            max_iter=num_iters,
            adaptive_lr=False,
            tol=tol,
        )
        km_full = KMeans(n_clusters=n_clusters, max_iter=num_iters, tol=tol)
        mbks = {
            "1.new lr MiniBatch": mbk_newlr,
            "2.MiniBatch": mbk_oldlr,
            "3.KMeans": km_full,
        }

        evaluations += evaluate(mbks, X, Y, num_iters, n_clusters, batch_size, n_runs)

    # Collect this dataset's results for plotting.
    df = pd.DataFrame(evaluations)
    df["dataset"] = dataset_name
    to_plot.append(df)

# Write the combined figure to the results directory.
os.makedirs("minibatch_results", exist_ok=True)
plot_results(to_plot)