Finalized version of benchmark plots

kastnerkyle · kastnerkyle · commit adc28e7665cc · 2014-06-25T15:36:30.000+02:00
diff --git a/benchmarks/bench_plot_incremental_pca.py b/benchmarks/bench_plot_incremental_pca.py
@@ -28,37 +28,26 @@ def calc_time_and_err(tf, arr):
     return meas_time, meas_err
 
 
-def compute_feature_bench(arr, n_components, batch_size):
-    print("===========================")
-    print("Computing feature bench for n_components %i, batch_size %i" % (
-        n_components, batch_size))
+def compute_pca_bench(arr, n_components):
     print("===========================")
-
+    print("Computing PCA bench for n_components %i" % n_components)
     pca = PCA(n_components=n_components)
     pca_time, pca_err = calc_time_and_err(pca, arr)
+    return pca_time, pca_err
 
-    ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
-    ipca_time, ipca_err = calc_time_and_err(ipca, arr)
 
+def compute_rpca_bench(arr, n_components, random_seed=1999):
+    print("===========================")
+    print("Computing RandomizedPCA bench for n_components %i" % n_components)
     rpca = RandomizedPCA(n_components=n_components, random_state=1999)
     rpca_time, rpca_err = calc_time_and_err(rpca, arr)
-    return pca_time, rpca_time, ipca_time, pca_err, rpca_err, ipca_err
-
-
-def compute_pca_bench(arr, n_components):
-    print("===========================")
-    print("Computing PCA bench for n_components %i" % n_components)
-    print("===========================")
-    pca = PCA(n_components=n_components)
-    pca_time, pca_err = calc_time_and_err(pca, arr)
-    return pca_time, pca_err
+    return rpca_time, rpca_err
 
 
 def compute_ipca_bench(arr, n_components, batch_size):
     print("===========================")
     print("Computing IncrementalPCA bench for n_components %i, batch_size %i" % (
         n_components, batch_size))
-    print("===========================")
 
     ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
     ipca_time, ipca_err = calc_time_and_err(ipca, arr)
@@ -78,7 +67,7 @@ def plot_feature_times(all_pca_times, all_rpca_times, all_ipca_times,
     plt.ylabel("Time (seconds)")
 
     ax = plt.gca()
-    n_ticks = 11
+    n_ticks = 10
     step_size = arr.shape[1] // 10
     ax.xaxis.set_major_locator(LinearLocator(numticks=n_ticks))
     ax.set_xticklabels(list(range(step_size, n_ticks * step_size - 1,
@@ -98,26 +87,28 @@ def plot_feature_errors(all_pca_err, all_rpca_err, all_ipca_err,
     plt.ylabel("Mean absolute error")
 
     ax = plt.gca()
-    n_ticks = 11
+    n_ticks = 10
     step_size = arr.shape[1] // 10
     ax.xaxis.set_major_locator(LinearLocator(numticks=n_ticks))
     ax.set_xticklabels(list(range(step_size, n_ticks * step_size - 1,
                                   step_size)))
 
 
-def plot_batch_times(all_pca_times, all_ipca_times, n_features, arr):
+def plot_batch_times(all_pca_times, all_rpca_times, all_ipca_times,
+                     n_features, arr):
     plt.figure()
     plt.plot(all_pca_times, label="PCA")
+    plt.plot(all_rpca_times, label="RandomizedPCA")
     plt.plot(all_ipca_times, label="IncrementalPCA")
     plt.legend(loc="lower left")
-    plt.suptitle("Algorithm runtime vs. batch_size %i\n \
+    plt.suptitle("Algorithm runtime vs. batch_size for n_features %i\n \
                  Low-rank matix (effective_rank 10), size %i x %i" % (
                  n_features, arr.shape[0], arr.shape[1]))
     plt.xlabel("Batch size")
     plt.ylabel("Time (seconds)")
 
     ax = plt.gca()
-    n_ticks = 11
+    n_ticks = 10
     step_size = arr.shape[0] // 10
     ax.xaxis.set_major_locator(LinearLocator(numticks=n_ticks))
     ax.set_xticklabels(list(range(step_size, n_ticks * step_size - 1,
@@ -136,10 +127,11 @@ def plot_batch_errors(all_pca_err, all_ipca_err, n_features, arr):
     plt.ylabel("Mean absolute error")
 
     ax = plt.gca()
-    n_ticks = 11
+    n_ticks = 10
     step_size = arr.shape[0] // 10
     ax.xaxis.set_major_locator(LinearLocator(numticks=n_ticks))
-    ax.set_xticklabels(list(range(1, n_ticks * step_size - 1, step_size)))
+    ax.set_xticklabels(list(range(step_size, n_ticks * step_size - 1,
+                                  step_size)))
 
 
 def fixed_batch_size_comparison(arr):
@@ -155,8 +147,9 @@ def fixed_batch_size_comparison(arr):
     bs = 1000
     # Compare runtimes and error for fixed batch size
     for nf in all_features:
-        (pca_time, rpca_time, ipca_time, pca_err,
-         rpca_err, ipca_err) = compute_feature_bench(arr, nf, bs)
+        pca_time, pca_err = compute_pca_bench(arr, nf)
+        rpca_time, rpca_err = compute_rpca_bench(arr, nf)
+        ipca_time, ipca_err = compute_ipca_bench(arr, nf, bs)
         all_pca_times.append(pca_time)
         all_rpca_times.append(rpca_time)
         all_ipca_times.append(ipca_time)
@@ -177,7 +170,7 @@ def fixed_batch_size_comparison(arr):
 
 def variable_batch_size_comparison(arr):
     batch_sizes = list(map(int, np.linspace(arr.shape[0] // 10, arr.shape[0],
-                                            num=5)))
+                                            num=10)))
     all_features = list(map(int, np.linspace(arr.shape[1] // 10, arr.shape[1],
                                              num=4)))
     for nf in all_features:
@@ -187,6 +180,12 @@ def variable_batch_size_comparison(arr):
         all_pca_times.extend([pca_time] * len(batch_sizes))
         all_pca_err.extend([pca_err] * len(batch_sizes))
 
+        all_rpca_times = []
+        all_rpca_err = []
+        rpca_time, rpca_err = compute_rpca_bench(arr, nf)
+        all_rpca_times.extend([rpca_time] * len(batch_sizes))
+        all_rpca_err.extend([rpca_err] * len(batch_sizes))
+
         all_ipca_times = []
         all_ipca_err = []
         for bs in batch_sizes:
@@ -195,14 +194,19 @@ def variable_batch_size_comparison(arr):
             all_ipca_err.append(ipca_err)
 
         all_pca_times = np.array(all_pca_times)
-        all_ipca_times = np.array(all_ipca_times)
         all_pca_err = np.array(all_pca_err)
+
+        all_rpca_times = np.array(all_rpca_times)
+        all_rpca_err = np.array(all_rpca_err)
+
+        all_ipca_times = np.array(all_ipca_times)
         all_ipca_err = np.array(all_ipca_err)
 
-        plot_batch_times(all_pca_times, all_ipca_times, nf, arr)
+        plot_batch_times(all_pca_times, all_rpca_times, all_ipca_times, nf, arr)
+        # RandomizedPCA error is always worse (~100x) than other PCA tests
         plot_batch_errors(all_pca_err, all_ipca_err, nf, arr)
 
-faces = fetch_lfw_people(resize=.15, min_faces_per_person=5)
+faces = fetch_lfw_people(resize=.2, min_faces_per_person=5)
 # limit dataset to 5000 people (don't care who they are!)
 X = faces.data[:5000]
 n_samples, h, w = faces.images.shape