scikit-learn · yaichm · Apr 16, 2025 · Apr 16, 2025 · Apr 20, 2025 · Apr 20, 2025
diff --git a/benchmarks/bench_isomap_auto_vs_randomized.py b/benchmarks/bench_isomap_auto_vs_randomized.py
@@ -0,0 +1,106 @@
+"""
+======================================================================
+Benchmark: Comparing Isomap Solvers - Execution Time vs. Representation
+======================================================================
+
+This benchmark demonstrates how different eigenvalue solvers in Isomap
+can affect execution time and embedding quality.
+
+Description:
+------------
+We use a subset of handwritten digits (`load_digits` with 6 classes).
+Each data point is projected into a lower-dimensional space (2D) using
+two different solvers (`auto` and `randomized`).
+
+What you can observe:
+----------------------
+- The `auto` solver provides a reference solution.
+- The `randomized` solver is tested for comparison in terms of
+  representation quality and execution time.
+
+Further exploration:
+---------------------
+You can modify the number of neighbors (`n_neighbors`) or experiment with
+other Isomap solvers.
+"""
+
+import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib import offsetbox
+
+from sklearn.datasets import load_digits
+from sklearn.manifold import Isomap
+from sklearn.preprocessing import MinMaxScaler
+
+# 1- Data Loading
+# ---------------
+digits = load_digits(n_class=6)
+X, y = digits.data, digits.target
+n_neighbors = 30  # Number of neighbors for Isomap
+
+
+# 2- Visualization Function
+# -------------------------
+def plot_embedding(ax, X, title):
+    """Displays projected points with image annotations."""
+    X = MinMaxScaler().fit_transform(X)
+
+    for digit in digits.target_names:
+        ax.scatter(
+            *X[y == digit].T,
+            marker=f"${digit}$",
+            s=60,
+            color=plt.cm.Dark2(digit),
+            alpha=0.425,
+            zorder=2,
+        )
+
+    # Add digit images in the projected space
+    shown_images = np.array([[1.0, 1.0]])
+    for i in range(X.shape[0]):
+        dist = np.sum((X[i] - shown_images) ** 2, 1)
+        if np.min(dist) < 4e-3:
+            continue
+        shown_images = np.concatenate([shown_images, [X[i]]], axis=0)
+        imagebox = offsetbox.AnnotationBbox(
+            offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), X[i]
+        )
+        imagebox.set(zorder=1)
+        ax.add_artist(imagebox)
+
+    ax.set_title(title)
+    ax.axis("off")
+
+
+# 3- Define Embeddings and Benchmark
+# ----------------------------------
+embeddings = {
+    "Isomap (auto solver)": Isomap(
+        n_neighbors=n_neighbors, n_components=2, eigen_solver="auto"
+    ),
+    "Isomap (randomized solver)": Isomap(
+        n_neighbors=n_neighbors, n_components=2, eigen_solver="randomized_value"
+    ),
+}
+
+projections, timing = {}, {}
+
+# Compute embeddings
+for name, transformer in embeddings.items():
+    print(f"Computing {name}...")
+    start_time = time.time()
+    projections[name] = transformer.fit_transform(X, y)
+    timing[name] = time.time() - start_time
+
+# 4- Display Results
+# ------------------
+fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+
+for ax, (name, proj) in zip(axes, projections.items()):
+    title = f"{name} (time: {timing[name]:.3f}s)"
+    plot_embedding(ax, proj, title)
+
+plt.tight_layout()
+plt.show()
diff --git a/benchmarks/bench_isomap_execution_time_full_vs_randomized.py b/benchmarks/bench_isomap_execution_time_full_vs_randomized.py
@@ -0,0 +1,118 @@
+"""
+======================================================================
+Isomap Solvers Benchmark: Execution Time vs Number of Samples
+======================================================================
+
+This benchmark demonstrates how the choice of eigen_solver in Isomap
+can significantly affect computation time, especially as the dataset
+size increases.
+
+Description:
+------------
+Synthetic datasets are generated using `make_classification` with a
+fixed number of features. The number of samples is
+varied from 1000 to 4000.
+
+For each setting, Isomap is applied using two different solvers:
+- 'auto' (full eigendecomposition)
+- 'randomized_value'
+
+The execution time of each solver is measured for each number of
+samples, and the average time over multiple runs (default: 3) is
+plotted.
+
+What you can observe:
+---------------------
+If n_components < 10, the randomized and auto solvers produce similar
+results (in this case, the arpack solver is selected).
+However, when n_components > 10, the randomized solver becomes significantly
+faster, especially as the number of samples increases.
+
+"""
+
+import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.datasets import make_classification
+from sklearn.manifold import Isomap
+
+# 1 - Experiment Setup
+# -- -- -- -- -- -- -- -- -- -
+n_samples_list = [1000, 2000, 3000, 4000]
+n_neighbors = 30
+n_components_list = [2, 10]
+n_features = 100
+n_iter = 3  # Number of repetitions for averaging execution time
+
+# Store timings for each value of n_components
+timing_all = {}
+
+for n_components in n_components_list:
+    # Create containers for timing results
+    timing = {
+        "auto": np.zeros((len(n_samples_list), n_iter)),
+        "randomized_value": np.zeros((len(n_samples_list), n_iter)),
+    }
+
+    for j, n in enumerate(n_samples_list):
+        # Generate synthetic classification dataset
+        X, _ = make_classification(
+            n_samples=n,
+            n_features=n_features,
+            n_redundant=0,
+            n_clusters_per_class=1,
+            n_classes=1,
+            random_state=42,
+        )
+
+        # Evaluate both solvers for multiple repetitions
+        for solver in ["auto", "randomized_value"]:
+            for i in range(n_iter):
+                model = Isomap(
+                    n_neighbors=n_neighbors,
+                    n_components=n_components,
+                    eigen_solver=solver,
+                )
+                start = time.perf_counter()
+                model.fit(X)
+                elapsed = time.perf_counter() - start
+                timing[solver][j, i] = elapsed
+
+    timing_all[n_components] = timing
+
+fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+
+for idx, n_components in enumerate(n_components_list):
+    ax = axes[idx]
+    timing = timing_all[n_components]
+    avg_full = timing["auto"].mean(axis=1)
+    std_full = timing["auto"].std(axis=1)
+    avg_rand = timing["randomized_value"].mean(axis=1)
+    std_rand = timing["randomized_value"].std(axis=1)
+
+    ax.errorbar(
+        n_samples_list,
+        avg_full,
+        yerr=std_full,
+        label="Isomap (full)",
+        marker="o",
+        linestyle="-",
+    )
+    ax.errorbar(
+        n_samples_list,
+        avg_rand,
+        yerr=std_rand,
+        label="Isomap (randomized)",
+        marker="x",
+        linestyle="--",
+    )
+    ax.set_xlabel("Number of Samples")
+    ax.set_ylabel("Execution Time (seconds)")
+    ax.set_title(f"Isomap Execution Time (n_components = {n_components})")
+    ax.legend()
+    ax.grid(True)
+
+plt.tight_layout()
+plt.show()
diff --git a/benchmarks/bench_isomap_solvers_n_samples_vs_reconstruction_error.py b/benchmarks/bench_isomap_solvers_n_samples_vs_reconstruction_error.py
@@ -0,0 +1,99 @@
+"""
+========================================================================
+Benchmark: Isomap Reconstruction Error - Standard vs. Randomized Solver
+========================================================================
+
+This benchmark illustrates how the number of samples impacts the quality
+of the Isomap embedding, using reconstruction error as a metric.
+
+Description:
+------------
+We generate synthetic 2D non-linear data (two concentric circles) with
+varying numbers of samples. For each subset, we compare the reconstruction
+error of two Isomap solvers:
+
+- The `auto` solver (standard dense or arpack, selected automatically).
+- The  `randomized_value` solver .
+
+What you can observe:
+---------------------
+-The randomized and auto solvers yield the same errors for different samples,
+which means that randomized provides the same projection quality as auto.
+
+Further exploration:
+---------------------
+- Modify the number of neighbors or iterations.
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.datasets import make_circles
+from sklearn.manifold import Isomap
+
+# 1- Experiment Configuration
+# ---------------------------
+min_n_samples, max_n_samples = 100, 4000
+n_samples_grid_size = 4  # Number of sample sizes to test
+
+n_samples_range = [
+    int(
+        min_n_samples
+        + np.floor((x / (n_samples_grid_size - 1)) * (max_n_samples - min_n_samples))
+    )
+    for x in range(0, n_samples_grid_size)
+]
+
+n_components = 2
+n_iter = 3  # Number of repetitions per sample size
+
+# 2- Data Generation
+# ------------------
+n_features = 2
+X_full, y_full = make_circles(
+    n_samples=max_n_samples, factor=0.3, noise=0.05, random_state=0
+)
+
+# 3- Benchmark Execution
+# ----------------------
+errors_randomized = []
+errors_full = []
+
+for n_samples in n_samples_range:
+    X, y = X_full[:n_samples], y_full[:n_samples]
+    print(f"Computing for n_samples = {n_samples}")
+
+    # Instantiate Isomap solvers
+    isomap_randomized = Isomap(
+        n_neighbors=50, n_components=n_components, eigen_solver="randomized_value"
+    )
+    isomap_auto = Isomap(n_neighbors=50, n_components=n_components, eigen_solver="auto")
+
+    # Fit and record reconstruction error
+    isomap_randomized.fit(X)
+    err_rand = isomap_randomized.reconstruction_error()
+    errors_randomized.append(err_rand)
+
+    isomap_auto.fit(X)
+    err_auto = isomap_auto.reconstruction_error()
+    errors_full.append(err_auto)
+
+# 4- Results Visualization
+# ------------------------
+plt.figure(figsize=(10, 6))
+plt.scatter(n_samples_range, errors_full, label="Isomap (auto)", color="b", marker="*")
+plt.scatter(
+    n_samples_range,
+    errors_randomized,
+    label="Isomap (randomized)",
+    color="r",
+    marker="x",
+)
+
+plt.title("Isomap Reconstruction Error vs. Number of Samples")
+plt.xlabel("Number of Samples")
+plt.ylabel("Reconstruction Error")
+plt.legend()
+plt.grid(True)
+plt.tight_layout()
+plt.show()
diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
@@ -78,6 +78,7 @@
 ref_time = np.empty((len(n_compo_range), n_iter)) * np.nan
 a_time = np.empty((len(n_compo_range), n_iter)) * np.nan
 r_time = np.empty((len(n_compo_range), n_iter)) * np.nan
+rv_time = np.empty((len(n_compo_range), n_iter)) * np.nan
 # loop
 for j, n_components in enumerate(n_compo_range):
     n_components = int(n_components)
@@ -119,13 +120,28 @@
         # check that the result is still correct despite the approximation
         assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
 
+    # D- randomized_value
+    print("  - randomized_value solver")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        rv_pred = (
+            KernelPCA(n_components, eigen_solver="randomized_value")
+            .fit(X_train)
+            .transform(X_test)
+        )
+        rv_time[j, i] = time.perf_counter() - start_time
+        # check that the result is still correct despite the approximation
+        assert_array_almost_equal(np.abs(rv_pred), np.abs(ref_pred))
+
 # Compute statistics for the 3 methods
 avg_ref_time = ref_time.mean(axis=1)
 std_ref_time = ref_time.std(axis=1)
 avg_a_time = a_time.mean(axis=1)
 std_a_time = a_time.std(axis=1)
 avg_r_time = r_time.mean(axis=1)
 std_r_time = r_time.std(axis=1)
+avg_rv_time = rv_time.mean(axis=1)
+std_rv_time = rv_time.std(axis=1)
 
 
 # 4- Plots
@@ -160,6 +176,15 @@
     color="b",
     label="randomized",
 )
+ax.errorbar(
+    n_compo_range,
+    avg_rv_time,
+    yerr=std_rv_time,
+    marker="x",
+    linestyle="",
+    color="purple",
+    label="randomized_value",
+)
 ax.legend(loc="upper left")
 
 # customize axes