"""
========================
IncrementalPCA benchmark
========================

Benchmarks for IncrementalPCA

"""

import gc
from collections import defaultdict
from time import time

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import fetch_lfw_people
from sklearn.decomposition import PCA, IncrementalPCA, RandomizedPCA
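# Note: RandomizedPCA is assumed to be available in the installed
# scikit-learn; newer releases deprecate it in favor of
# PCA(svd_solver='randomized').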


def plot_results(X, y, label):
    plt.plot(X, y, label=label, marker='o')


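# Fit the estimator (timing only the call to fit) and report the mean
# absolute reconstruction error of a transform/inverse_transform round trip.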
def benchmark(estimator, data):
    gc.collect()
    print("Benching %s" % estimator)
    t0 = time()
    estimator.fit(data)
    training_time = time() - t0
    data_t = estimator.transform(data)
    data_r = estimator.inverse_transform(data_t)
    reconstruction_error = np.mean(np.abs(data - data_r))
    return {'time': training_time, 'error': reconstruction_error}


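# The four helpers below plot runtime and reconstruction error, first as a
# function of n_components (fixed batch size), then as a function of
# batch size (fixed n_components).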
def plot_feature_times(all_times, batch_size, all_components, data):
    plt.figure()
    plot_results(all_components, all_times['pca'], label="PCA")
    plot_results(all_components, all_times['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_times['rpca'], label="RandomizedPCA")
    plt.legend(loc="upper left")
    plt.suptitle("Algorithm runtime vs. n_components\n"
                 "LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Time (seconds)")


def plot_feature_errors(all_errors, batch_size, all_components, data):
    plt.figure()
    plot_results(all_components, all_errors['pca'], label="PCA")
    plot_results(all_components, all_errors['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_errors['rpca'], label="RandomizedPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm error vs. n_components\n"
                 "LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Mean absolute error")


def plot_batch_times(all_times, n_components, all_batch_sizes, data):
    plt.figure()
    plot_results(all_batch_sizes, all_times['pca'], label="PCA")
    plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA")
    plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n"
                 "LFW, size %i x %i" % (
                     n_components, data.shape[0], data.shape[1]))
    plt.xlabel("Batch size")
    plt.ylabel("Time (seconds)")


def plot_batch_errors(all_errors, n_components, all_batch_sizes, data):
    plt.figure()
    plot_results(all_batch_sizes, all_errors['pca'], label="PCA")
    plot_results(all_batch_sizes, all_errors['ipca'], label="IncrementalPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm error vs. batch_size for n_components %i\n"
                 "LFW, size %i x %i" % (
                     n_components, data.shape[0], data.shape[1]))
    plt.xlabel("Batch size")
    plt.ylabel("Mean absolute error")


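# Sweep n_components with the batch size held fixed, benchmarking PCA,
# RandomizedPCA and IncrementalPCA on the same data.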
def fixed_batch_size_comparison(data):
    all_components = [i.astype(int) for i in np.linspace(data.shape[1] // 10,
                                                         data.shape[1], num=5)]
    batch_size = 1000
    # Compare runtimes and errors for a fixed batch size
    all_times = defaultdict(list)
    all_errors = defaultdict(list)
    for n_components in all_components:
        pca = PCA(n_components=n_components)
        rpca = RandomizedPCA(n_components=n_components, random_state=1999)
        ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
        results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
                                                               ('ipca', ipca),
                                                               ('rpca', rpca)]}

        for k in sorted(results_dict.keys()):
            all_times[k].append(results_dict[k]['time'])
            all_errors[k].append(results_dict[k]['error'])

    plot_feature_times(all_times, batch_size, all_components, data)
    plot_feature_errors(all_errors, batch_size, all_components, data)


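# Sweep the IncrementalPCA batch size for several n_components settings,
# using full-batch PCA and RandomizedPCA results as flat reference lines.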
def variable_batch_size_comparison(data):
    batch_sizes = [i.astype(int) for i in np.linspace(data.shape[0] // 10,
                                                      data.shape[0], num=10)]

    for n_components in [i.astype(int) for i in
                         np.linspace(data.shape[1] // 10,
                                     data.shape[1], num=4)]:
        all_times = defaultdict(list)
        all_errors = defaultdict(list)
        pca = PCA(n_components=n_components)
        rpca = RandomizedPCA(n_components=n_components, random_state=1999)
        results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
                                                               ('rpca', rpca)]}

        # Create flat baselines to compare the variation over batch size
        all_times['pca'].extend([results_dict['pca']['time']] *
                                len(batch_sizes))
        all_errors['pca'].extend([results_dict['pca']['error']] *
                                 len(batch_sizes))
        all_times['rpca'].extend([results_dict['rpca']['time']] *
                                 len(batch_sizes))
        all_errors['rpca'].extend([results_dict['rpca']['error']] *
                                  len(batch_sizes))
        for batch_size in batch_sizes:
            ipca = IncrementalPCA(n_components=n_components,
                                  batch_size=batch_size)
            results_dict = {k: benchmark(est, data) for k, est in [('ipca',
                                                                    ipca)]}
            all_times['ipca'].append(results_dict['ipca']['time'])
            all_errors['ipca'].append(results_dict['ipca']['error'])

        plot_batch_times(all_times, n_components, batch_sizes, data)
        # RandomizedPCA error is always worse (approx. 100x) than the other
        # PCA variants
        plot_batch_errors(all_errors, n_components, batch_sizes, data)


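# Load a downscaled LFW subset, standardize it, and run both comparisons.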
faces = fetch_lfw_people(resize=.2, min_faces_per_person=5)
# limit the dataset to 5000 samples (it does not matter which faces)
X = faces.data[:5000]
n_samples, h, w = faces.images.shape
n_features = X.shape[1]

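# Standardize each feature (pixel) to zero mean and unit variance.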
X -= X.mean(axis=0)
X /= X.std(axis=0)

fixed_batch_size_comparison(X)
variable_batch_size_comparison(X)
plt.show()