Skip to content

ENH: Faster Eigen Decomposition For Isomap & KernelPCA #31247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions benchmarks/bench_isomap_auto_vs_randomized.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""
======================================================================
Benchmark: Comparing Isomap Solvers - Execution Time vs. Representation
======================================================================

This benchmark demonstrates how different eigenvalue solvers in Isomap
can affect execution time and embedding quality.

Description:
------------
We use a subset of handwritten digits (`load_digits` with 6 classes).
Each data point is projected into a lower-dimensional space (2D) using
two different solvers (`auto` and `randomized`).

What you can observe:
----------------------
- The `auto` solver provides a reference solution.
- The `randomized` solver is tested for comparison in terms of
representation quality and execution time.

Further exploration:
---------------------
You can modify the number of neighbors (`n_neighbors`) or experiment with
other Isomap solvers.
"""

import time

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import offsetbox

from sklearn.datasets import load_digits
from sklearn.manifold import Isomap
from sklearn.preprocessing import MinMaxScaler

# 1- Data Loading
# ---------------
digits = load_digits(n_class=6)
X, y = digits.data, digits.target
n_neighbors = 30 # Number of neighbors for Isomap


# 2- Visualization Function
# -------------------------
def plot_embedding(ax, X, title):
"""Displays projected points with image annotations."""
X = MinMaxScaler().fit_transform(X)

for digit in digits.target_names:
ax.scatter(
*X[y == digit].T,
marker=f"${digit}$",
s=60,
color=plt.cm.Dark2(digit),
alpha=0.425,
zorder=2,
)

# Add digit images in the projected space
shown_images = np.array([[1.0, 1.0]])
for i in range(X.shape[0]):
dist = np.sum((X[i] - shown_images) ** 2, 1)
if np.min(dist) < 4e-3:
continue
shown_images = np.concatenate([shown_images, [X[i]]], axis=0)
imagebox = offsetbox.AnnotationBbox(
offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), X[i]
)
imagebox.set(zorder=1)
ax.add_artist(imagebox)

ax.set_title(title)
ax.axis("off")


# 3- Define Embeddings and Benchmark
# ----------------------------------
embeddings = {
"Isomap (auto solver)": Isomap(
n_neighbors=n_neighbors, n_components=2, eigen_solver="auto"
),
"Isomap (randomized solver)": Isomap(
n_neighbors=n_neighbors, n_components=2, eigen_solver="randomized_value"
),
}

projections, timing = {}, {}

# Compute embeddings
for name, transformer in embeddings.items():
print(f"Computing {name}...")
start_time = time.time()
projections[name] = transformer.fit_transform(X, y)
timing[name] = time.time() - start_time

# 4- Display Results
# ------------------
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (name, proj) in zip(axes, projections.items()):
title = f"{name} (time: {timing[name]:.3f}s)"
plot_embedding(ax, proj, title)

plt.tight_layout()
plt.show()
118 changes: 118 additions & 0 deletions benchmarks/bench_isomap_execution_time_full_vs_randomized.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""
======================================================================
Isomap Solvers Benchmark: Execution Time vs Number of Samples
======================================================================

This benchmark demonstrates how the choice of eigen_solver in Isomap
can significantly affect computation time, especially as the dataset
size increases.

Description:
------------
Synthetic datasets are generated using `make_classification` with a
fixed number of features. The number of samples is
varied from 1000 to 4000.

For each setting, Isomap is applied using two different solvers:
- 'auto' (full eigendecomposition)
- 'randomized_value'

The execution time of each solver is measured for each number of
samples, and the average time over multiple runs (default: 3) is
plotted.

What you can observe:
---------------------
If n_components < 10, the randomized and auto solvers produce similar
results (in this case, the arpack solver is selected).
However, when n_components > 10, the randomized solver becomes significantly
faster, especially as the number of samples increases.

"""

import time

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_classification
from sklearn.manifold import Isomap

# 1 - Experiment Setup
# -- -- -- -- -- -- -- -- -- -
n_samples_list = [1000, 2000, 3000, 4000]
n_neighbors = 30
n_components_list = [2, 10]
n_features = 100
n_iter = 3 # Number of repetitions for averaging execution time

# Store timings for each value of n_components
timing_all = {}

for n_components in n_components_list:
# Create containers for timing results
timing = {
"auto": np.zeros((len(n_samples_list), n_iter)),
"randomized_value": np.zeros((len(n_samples_list), n_iter)),
}

for j, n in enumerate(n_samples_list):
# Generate synthetic classification dataset
X, _ = make_classification(
n_samples=n,
n_features=n_features,
n_redundant=0,
n_clusters_per_class=1,
n_classes=1,
random_state=42,
)

# Evaluate both solvers for multiple repetitions
for solver in ["auto", "randomized_value"]:
for i in range(n_iter):
model = Isomap(
n_neighbors=n_neighbors,
n_components=n_components,
eigen_solver=solver,
)
start = time.perf_counter()
model.fit(X)
elapsed = time.perf_counter() - start
timing[solver][j, i] = elapsed

timing_all[n_components] = timing

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for idx, n_components in enumerate(n_components_list):
ax = axes[idx]
timing = timing_all[n_components]
avg_full = timing["auto"].mean(axis=1)
std_full = timing["auto"].std(axis=1)
avg_rand = timing["randomized_value"].mean(axis=1)
std_rand = timing["randomized_value"].std(axis=1)

ax.errorbar(
n_samples_list,
avg_full,
yerr=std_full,
label="Isomap (full)",
marker="o",
linestyle="-",
)
ax.errorbar(
n_samples_list,
avg_rand,
yerr=std_rand,
label="Isomap (randomized)",
marker="x",
linestyle="--",
)
ax.set_xlabel("Number of Samples")
ax.set_ylabel("Execution Time (seconds)")
ax.set_title(f"Isomap Execution Time (n_components = {n_components})")
ax.legend()
ax.grid(True)

plt.tight_layout()
plt.show()
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
========================================================================
Benchmark: Isomap Reconstruction Error - Standard vs. Randomized Solver
========================================================================

This benchmark illustrates how the number of samples impacts the quality
of the Isomap embedding, using reconstruction error as a metric.

Description:
------------
We generate synthetic 2D non-linear data (two concentric circles) with
varying numbers of samples. For each subset, we compare the reconstruction
error of two Isomap solvers:

- The `auto` solver (standard dense or arpack, selected automatically).
- The `randomized_value` solver .

What you can observe:
---------------------
-The randomized and auto solvers yield the same errors for different samples,
which means that randomized provides the same projection quality as auto.

Further exploration:
---------------------
- Modify the number of neighbors or iterations.
"""

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_circles
from sklearn.manifold import Isomap

# 1- Experiment Configuration
# ---------------------------
min_n_samples, max_n_samples = 100, 4000
n_samples_grid_size = 4 # Number of sample sizes to test

n_samples_range = [
int(
min_n_samples
+ np.floor((x / (n_samples_grid_size - 1)) * (max_n_samples - min_n_samples))
)
for x in range(0, n_samples_grid_size)
]

n_components = 2
n_iter = 3 # Number of repetitions per sample size

# 2- Data Generation
# ------------------
n_features = 2
X_full, y_full = make_circles(
n_samples=max_n_samples, factor=0.3, noise=0.05, random_state=0
)

# 3- Benchmark Execution
# ----------------------
errors_randomized = []
errors_full = []

for n_samples in n_samples_range:
X, y = X_full[:n_samples], y_full[:n_samples]
print(f"Computing for n_samples = {n_samples}")

# Instantiate Isomap solvers
isomap_randomized = Isomap(
n_neighbors=50, n_components=n_components, eigen_solver="randomized_value"
)
isomap_auto = Isomap(n_neighbors=50, n_components=n_components, eigen_solver="auto")

# Fit and record reconstruction error
isomap_randomized.fit(X)
err_rand = isomap_randomized.reconstruction_error()
errors_randomized.append(err_rand)

isomap_auto.fit(X)
err_auto = isomap_auto.reconstruction_error()
errors_full.append(err_auto)

# 4- Results Visualization
# ------------------------
plt.figure(figsize=(10, 6))
plt.scatter(n_samples_range, errors_full, label="Isomap (auto)", color="b", marker="*")
plt.scatter(
n_samples_range,
errors_randomized,
label="Isomap (randomized)",
color="r",
marker="x",
)

plt.title("Isomap Reconstruction Error vs. Number of Samples")
plt.xlabel("Number of Samples")
plt.ylabel("Reconstruction Error")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
25 changes: 25 additions & 0 deletions benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
ref_time = np.empty((len(n_compo_range), n_iter)) * np.nan
a_time = np.empty((len(n_compo_range), n_iter)) * np.nan
r_time = np.empty((len(n_compo_range), n_iter)) * np.nan
rv_time = np.empty((len(n_compo_range), n_iter)) * np.nan
# loop
for j, n_components in enumerate(n_compo_range):
n_components = int(n_components)
Expand Down Expand Up @@ -119,13 +120,28 @@
# check that the result is still correct despite the approximation
assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))

# D- randomized_value
print(" - randomized_value solver")
for i in range(n_iter):
start_time = time.perf_counter()
rv_pred = (
KernelPCA(n_components, eigen_solver="randomized_value")
.fit(X_train)
.transform(X_test)
)
rv_time[j, i] = time.perf_counter() - start_time
# check that the result is still correct despite the approximation
assert_array_almost_equal(np.abs(rv_pred), np.abs(ref_pred))

# Compute statistics for the 3 methods
avg_ref_time = ref_time.mean(axis=1)
std_ref_time = ref_time.std(axis=1)
avg_a_time = a_time.mean(axis=1)
std_a_time = a_time.std(axis=1)
avg_r_time = r_time.mean(axis=1)
std_r_time = r_time.std(axis=1)
avg_rv_time = rv_time.mean(axis=1)
std_rv_time = rv_time.std(axis=1)


# 4- Plots
Expand Down Expand Up @@ -160,6 +176,15 @@
color="b",
label="randomized",
)
ax.errorbar(
n_compo_range,
avg_rv_time,
yerr=std_rv_time,
marker="x",
linestyle="",
color="purple",
label="randomized_value",
)
ax.legend(loc="upper left")

# customize axes
Expand Down
Loading