diff --git a/benchmarks/bench_isomap_auto_vs_randomized.py b/benchmarks/bench_isomap_auto_vs_randomized.py
new file mode 100644
index 0000000000000..947e4e27752bc
--- /dev/null
+++ b/benchmarks/bench_isomap_auto_vs_randomized.py
@@ -0,0 +1,106 @@
+"""
+======================================================================
+Benchmark: Comparing Isomap Solvers - Execution Time vs. Representation
+======================================================================
+
+This benchmark demonstrates how different eigenvalue solvers in Isomap
+can affect execution time and embedding quality.
+
+Description:
+------------
+We use a subset of handwritten digits (`load_digits` with 6 classes).
+Each data point is projected into a lower-dimensional space (2D) using
+two different solvers (`auto` and `randomized_value`).
+
+What you can observe:
+---------------------
+- The `auto` solver provides a reference solution.
+- The `randomized_value` solver is tested for comparison in terms of
+  representation quality and execution time.
+
+Further exploration:
+--------------------
+You can modify the number of neighbors (`n_neighbors`) or experiment with
+other Isomap solvers.
+"""
+
+import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib import offsetbox
+
+from sklearn.datasets import load_digits
+from sklearn.manifold import Isomap
+from sklearn.preprocessing import MinMaxScaler
+
+# 1- Data Loading
+# ---------------
+digits = load_digits(n_class=6)
+X, y = digits.data, digits.target
+n_neighbors = 30  # Number of neighbors for Isomap
+
+
+# 2- Visualization Function
+# -------------------------
+def plot_embedding(ax, X, title):
+    """Displays projected points with image annotations."""
+    X = MinMaxScaler().fit_transform(X)
+
+    for digit in digits.target_names:
+        ax.scatter(
+            *X[y == digit].T,
+            marker=f"${digit}$",
+            s=60,
+            color=plt.cm.Dark2(digit),
+            alpha=0.425,
+            zorder=2,
+        )
+
+    # Add digit images in the projected space
+    shown_images = np.array([[1.0, 1.0]])
+    for i in range(X.shape[0]):
+        dist = np.sum((X[i] - shown_images) ** 2, 1)
+        if np.min(dist) < 4e-3:
+            continue
+        shown_images = np.concatenate([shown_images, [X[i]]], axis=0)
+        imagebox = offsetbox.AnnotationBbox(
+            offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), X[i]
+        )
+        imagebox.set(zorder=1)
+        ax.add_artist(imagebox)
+
+    ax.set_title(title)
+    ax.axis("off")
+
+
+# 3- Define Embeddings and Benchmark
+# ----------------------------------
+embeddings = {
+    "Isomap (auto solver)": Isomap(
+        n_neighbors=n_neighbors, n_components=2, eigen_solver="auto"
+    ),
+    "Isomap (randomized solver)": Isomap(
+        n_neighbors=n_neighbors, n_components=2, eigen_solver="randomized_value"
+    ),
+}
+
+projections, timing = {}, {}
+
+# Compute embeddings
+for name, transformer in embeddings.items():
+    print(f"Computing {name}...")
+    start_time = time.time()
+    projections[name] = transformer.fit_transform(X)
+    timing[name] = time.time() - start_time
+
+# 4- Display Results
+# ------------------
+fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+
+for ax, (name, proj) in zip(axes, projections.items()):
+    title = f"{name} (time: {timing[name]:.3f}s)"
+    plot_embedding(ax, proj, title)
+
+plt.tight_layout()
+plt.show()
diff --git a/benchmarks/bench_isomap_execution_time_full_vs_randomized.py b/benchmarks/bench_isomap_execution_time_full_vs_randomized.py
new file mode 100644
index 0000000000000..68f2861f9aac0
--- /dev/null
+++ b/benchmarks/bench_isomap_execution_time_full_vs_randomized.py
@@ -0,0 +1,118 @@
+"""
+======================================================================
+Isomap Solvers Benchmark: Execution Time vs Number of Samples
+======================================================================
+
+This benchmark demonstrates how the choice of eigen_solver in Isomap
+can significantly affect computation time, especially as the dataset
+size increases.
+
+Description:
+------------
+Synthetic datasets are generated using `make_classification` with a
+fixed number of features. The number of samples is varied from 1000
+to 4000.
+
+For each setting, Isomap is applied using two different solvers:
+- 'auto' (which selects either 'arpack' or 'dense' automatically)
+- 'randomized_value'
+
+The execution time of each solver is measured for each number of
+samples, and the average time over multiple runs (default: 3) is
+plotted.
+
+What you can observe:
+---------------------
+For n_components < 10 (here, 2), the 'auto' strategy selects the arpack
+solver and both solvers take a similar time.
+For n_components >= 10 (here, 10), 'auto' selects the dense solver and
+the randomized solver becomes significantly faster, especially as the
+number of samples increases.
+
+"""
+
+import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.datasets import make_classification
+from sklearn.manifold import Isomap
+
+# 1 - Experiment Setup
+# --------------------
+n_samples_list = [1000, 2000, 3000, 4000]
+n_neighbors = 30
+n_components_list = [2, 10]
+n_features = 100
+n_iter = 3  # Number of repetitions for averaging execution time
+
+# Store timings for each value of n_components
+timing_all = {}
+
+for n_components in n_components_list:
+    # Create containers for timing results
+    timing = {
+        "auto": np.zeros((len(n_samples_list), n_iter)),
+        "randomized_value": np.zeros((len(n_samples_list), n_iter)),
+    }
+
+    for j, n in enumerate(n_samples_list):
+        # Generate synthetic classification dataset
+        X, _ = make_classification(
+            n_samples=n,
+            n_features=n_features,
+            n_redundant=0,
+            n_clusters_per_class=1,
+            n_classes=1,
+            random_state=42,
+        )
+
+        # Evaluate both solvers for multiple repetitions
+        for solver in ["auto", "randomized_value"]:
+            for i in range(n_iter):
+                model = Isomap(
+                    n_neighbors=n_neighbors,
+                    n_components=n_components,
+                    eigen_solver=solver,
+                )
+                start = time.perf_counter()
+                model.fit(X)
+                elapsed = time.perf_counter() - start
+                timing[solver][j, i] = elapsed
+
+    timing_all[n_components] = timing
+
+fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+
+for idx, n_components in enumerate(n_components_list):
+    ax = axes[idx]
+    timing = timing_all[n_components]
+    avg_auto = timing["auto"].mean(axis=1)
+    std_auto = timing["auto"].std(axis=1)
+    avg_rand = timing["randomized_value"].mean(axis=1)
+    std_rand = timing["randomized_value"].std(axis=1)
+
+    ax.errorbar(
+        n_samples_list,
+        avg_auto,
+        yerr=std_auto,
+        label="Isomap (auto)",
+        marker="o",
+        linestyle="-",
+    )
+    ax.errorbar(
+        n_samples_list,
+        avg_rand,
+        yerr=std_rand,
+        label="Isomap (randomized)",
+        marker="x",
+        linestyle="--",
+    )
+    ax.set_xlabel("Number of Samples")
+    ax.set_ylabel("Execution Time (seconds)")
+    ax.set_title(f"Isomap Execution Time (n_components = {n_components})")
+    ax.legend()
+    ax.grid(True)
+
+plt.tight_layout()
+plt.show()
diff --git a/benchmarks/bench_isomap_solvers_n_samples_vs_reconstruction_error.py b/benchmarks/bench_isomap_solvers_n_samples_vs_reconstruction_error.py
new file mode 100644
index 0000000000000..bc518e570d29d
--- /dev/null
+++ b/benchmarks/bench_isomap_solvers_n_samples_vs_reconstruction_error.py
@@ -0,0 +1,99 @@
+"""
+========================================================================
+Benchmark: Isomap Reconstruction Error - Standard vs. Randomized Solver
+========================================================================
+
+This benchmark illustrates how the number of samples impacts the quality
+of the Isomap embedding, using reconstruction error as a metric.
+
+Description:
+------------
+We generate synthetic 2D non-linear data (two concentric circles) with
+varying numbers of samples. For each subset, we compare the reconstruction
+error of two Isomap solvers:
+
+- The `auto` solver (standard dense or arpack, selected automatically).
+- The `randomized_value` solver.
+
+What you can observe:
+---------------------
+- The randomized and auto solvers yield essentially the same errors across
+  sample sizes, which means that the randomized solver provides the same
+  projection quality as auto.
+
+Further exploration:
+--------------------
+- Modify the number of neighbors or the sample size grid.
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.datasets import make_circles
+from sklearn.manifold import Isomap
+
+# 1- Experiment Configuration
+# ---------------------------
+min_n_samples, max_n_samples = 100, 4000
+n_samples_grid_size = 4  # Number of sample sizes to test
+
+n_samples_range = [
+    int(
+        min_n_samples
+        + np.floor((x / (n_samples_grid_size - 1)) * (max_n_samples - min_n_samples))
+    )
+    for x in range(0, n_samples_grid_size)
+]
+
+n_components = 2
+
+# 2- Data Generation
+# ------------------
+X_full, y_full = make_circles(
+    n_samples=max_n_samples, factor=0.3, noise=0.05, random_state=0
+)
+
+# 3- Benchmark Execution
+# ----------------------
+errors_randomized = []
+errors_full = []
+
+for n_samples in n_samples_range:
+    X, y = X_full[:n_samples], y_full[:n_samples]
+    print(f"Computing for n_samples = {n_samples}")
+
+    # Instantiate Isomap solvers
+    isomap_randomized = Isomap(
+        n_neighbors=50, n_components=n_components, eigen_solver="randomized_value"
+    )
+    isomap_auto = Isomap(n_neighbors=50, n_components=n_components, eigen_solver="auto")
+
+    # Fit and record reconstruction error
+    isomap_randomized.fit(X)
+    err_rand = isomap_randomized.reconstruction_error()
+    errors_randomized.append(err_rand)
+
+    isomap_auto.fit(X)
+    err_auto = isomap_auto.reconstruction_error()
+    errors_full.append(err_auto)
+
+# 4- Results Visualization
+# ------------------------
+plt.figure(figsize=(10, 6))
+plt.scatter(n_samples_range, errors_full, label="Isomap (auto)", color="b", marker="*")
+plt.scatter(
+    n_samples_range,
+    errors_randomized,
+    label="Isomap (randomized)",
+    color="r",
+    marker="x",
+)
+
+plt.title("Isomap Reconstruction Error vs. Number of Samples")
+plt.xlabel("Number of Samples")
+plt.ylabel("Reconstruction Error")
+plt.legend()
+plt.grid(True)
+plt.tight_layout()
+plt.show()
diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
index a468f7b3e1abf..4e3bf5a55f93d 100644
--- a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
+++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
@@ -78,6 +78,7 @@
 ref_time = np.empty((len(n_compo_range), n_iter)) * np.nan
 a_time = np.empty((len(n_compo_range), n_iter)) * np.nan
 r_time = np.empty((len(n_compo_range), n_iter)) * np.nan
+rv_time = np.empty((len(n_compo_range), n_iter)) * np.nan
 # loop
 for j, n_components in enumerate(n_compo_range):
     n_components = int(n_components)
@@ -119,6 +120,19 @@
         # check that the result is still correct despite the approximation
         assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
 
+    # D- randomized_value
+    print("   - randomized_value solver")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        rv_pred = (
+            KernelPCA(n_components, eigen_solver="randomized_value")
+            .fit(X_train)
+            .transform(X_test)
+        )
+        rv_time[j, i] = time.perf_counter() - start_time
+        # check that the result is still correct despite the approximation
+        assert_array_almost_equal(np.abs(rv_pred), np.abs(ref_pred))
+
 # Compute statistics for the 3 methods
 avg_ref_time = ref_time.mean(axis=1)
 std_ref_time = ref_time.std(axis=1)
@@ -126,6 +140,8 @@
 std_a_time = a_time.std(axis=1)
 avg_r_time = r_time.mean(axis=1)
 std_r_time = r_time.std(axis=1)
+avg_rv_time = rv_time.mean(axis=1)
+std_rv_time = rv_time.std(axis=1)
 
 
 # 4- Plots
@@ -160,6 +176,15 @@
     color="b",
     label="randomized",
 )
+ax.errorbar(
+    n_compo_range,
+    avg_rv_time,
+    yerr=std_rv_time,
+    marker="x",
+    linestyle="",
+    color="purple",
+    label="randomized_value",
+)
 ax.legend(loc="upper left")
 
 # customize axes
diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py
index cae74c6f442ff..8c956d3ec0c24 100644
--- a/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py
+++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py
@@ -80,6 +80,7 @@
 ref_time = np.empty((len(n_samples_range), n_iter)) * np.nan
 a_time = np.empty((len(n_samples_range), n_iter)) * np.nan
 r_time = np.empty((len(n_samples_range), n_iter)) * np.nan
+rv_time = np.empty((len(n_samples_range), n_iter)) * np.nan
 
 # loop
 for j, n_samples in enumerate(n_samples_range):
@@ -125,6 +126,19 @@
         # check that the result is still correct despite the approximation
         assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
 
+    # D- randomized_value
+    print("   - randomized_value")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        rv_pred = (
+            KernelPCA(n_components, eigen_solver="randomized_value")
+            .fit(X_train)
+            .transform(X_test)
+        )
+        rv_time[j, i] = time.perf_counter() - start_time
+        # check that the result is still correct despite the approximation
+        assert_array_almost_equal(np.abs(rv_pred), np.abs(ref_pred))
+
 # Compute statistics for the 3 methods
 avg_ref_time = ref_time.mean(axis=1)
 std_ref_time = ref_time.std(axis=1)
@@ -132,7 +146,8 @@
 std_a_time = a_time.std(axis=1)
 avg_r_time = r_time.mean(axis=1)
 std_r_time = r_time.std(axis=1)
-
+avg_rv_time = rv_time.mean(axis=1)
+std_rv_time = rv_time.std(axis=1)
 
 # 4- Plots
 # --------
@@ -167,6 +182,15 @@
     color="b",
     label="randomized",
 )
+ax.errorbar(
+    n_samples_range,
+    avg_rv_time,
+    yerr=std_rv_time,
+    marker="x",
+    linestyle="",
+    color="purple",
+    label="randomized_value",
+)
 ax.legend(loc="upper left")
 
 # customize axes
diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/31247.feature.rst b/doc/whats_new/upcoming_changes/sklearn.utils/31247.feature.rst
new file mode 100644
index 0000000000000..91c89d6090f98
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.utils/31247.feature.rst
@@ -0,0 +1,17 @@
+Feature
+-------
+
+- Added :func:`sklearn.utils.extmath.randomized_eigen_decomposition`, an approximate
+  eigendecomposition based on Algorithm 5.3 of Halko, Martinsson, and Tropp (2011):
+  *Finding Structure with Randomness: Probabilistic Algorithms for Constructing
+  Approximate Matrix Decompositions*. It provides a faster alternative to the exact
+  eigendecomposition solvers.
+
+- The new solver is available in :class:`sklearn.manifold.Isomap` and
+  :class:`sklearn.decomposition.KernelPCA` through the ``eigen_solver="randomized_value"``
+  option.
+
+- Added a test suite comparing the new solver to the existing ones (``'arpack'``,
+  ``'dense'``, etc.), ensuring numerical accuracy and stability, and benchmarks
+  analyzing the performance improvements over exact eigendecomposition.
+
+By :user:`Sylvain Marié <smarie>`, :user:`Mohamed Yaich <yaichm>`,
+:user:`Oussama Er-rabie <eroussama>`, :user:`Mohamed Dlimi <Dlimim>`,
+:user:`Hamza Zeroual <HamzaLuffy>` and :user:`Amine Hannoun <AmineHannoun>`.
\ No newline at end of file
diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
index 79573651eeb84..c727570ac97fb 100644
--- a/sklearn/decomposition/_kernel_pca.py
+++ b/sklearn/decomposition/_kernel_pca.py
@@ -263,7 +263,9 @@ class KernelPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator
         "kernel_params": [dict, None],
         "alpha": [Interval(Real, 0, None, closed="left")],
         "fit_inverse_transform": ["boolean"],
-        "eigen_solver": [StrOptions({"auto", "dense", "arpack", "randomized"})],
+        "eigen_solver": [
+            StrOptions({"auto", "dense", "arpack", "randomized", "randomized_value"})
+        ],
         "tol": [Interval(Real, 0, None, closed="left")],
         "max_iter": [
             Interval(Integral, 1, None, closed="left"),
@@ -363,6 +365,14 @@ def _fit_transform_in_place(self, K):
                 random_state=self.random_state,
                 selection="module",
             )
+        elif eigen_solver == "randomized_value":
+            self.eigenvalues_, self.eigenvectors_ = _randomized_eigsh(
+                K,
+                n_components=n_components,
+                n_iter=self.iterated_power,
+                random_state=self.random_state,
+                selection="value",
+            )
 
         # make sure that the eigenvalues are ok and fix numerical issues
         self.eigenvalues_ = _check_psd_eigenvalues(
diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py
index 57ae75c184622..36165dc304c97 100644
--- a/sklearn/decomposition/tests/test_kernel_pca.py
+++ b/sklearn/decomposition/tests/test_kernel_pca.py
@@ -37,7 +37,7 @@ def histogram(x, y, **kwargs):
         assert kwargs == {}  # no kernel_params that we didn't ask for
         return np.minimum(x, y).sum()
 
-    for eigen_solver in ("auto", "dense", "arpack", "randomized"):
+    for eigen_solver in ("auto", "dense", "arpack", "randomized", "randomized_value"):
         for kernel in ("linear", "rbf", "poly", histogram):
             # histogram kernel produces singular matrix inside linalg.solve
             # XXX use a least-squares approximation?
@@ -128,7 +128,7 @@ def test_kernel_pca_sparse(csr_container, global_random_seed):
     X_fit = csr_container(rng.random_sample((5, 4)))
     X_pred = csr_container(rng.random_sample((2, 4)))
 
-    for eigen_solver in ("auto", "arpack", "randomized"):
+    for eigen_solver in ("auto", "arpack", "randomized", "randomized_value"):
         for kernel in ("linear", "rbf", "poly"):
             # transform fit data
             kpca = KernelPCA(
@@ -191,7 +191,7 @@ def test_kernel_pca_n_components():
     X_fit = rng.random_sample((5, 4))
     X_pred = rng.random_sample((2, 4))
 
-    for eigen_solver in ("dense", "arpack", "randomized"):
+    for eigen_solver in ("dense", "arpack", "randomized", "randomized_value"):
         for c in [1, 2, 4]:
             kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
             shape = kpca.fit(X_fit).transform(X_pred).shape
@@ -252,7 +252,7 @@ def test_kernel_pca_precomputed(global_random_seed):
     X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))
 
-    for eigen_solver in ("dense", "arpack", "randomized"):
+    for eigen_solver in ("dense", "arpack", "randomized", "randomized_value"):
         X_kpca = (
             KernelPCA(4, eigen_solver=eigen_solver, random_state=0)
             .fit(X_fit)
@@ -284,7 +284,9 @@
     assert_array_almost_equal(np.abs(X_kpca_train), np.abs(X_kpca_train2))
 
 
-@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
+@pytest.mark.parametrize(
+    "solver", ["auto", "dense", "arpack", "randomized", "randomized_value"]
+)
 def test_kernel_pca_precomputed_non_symmetric(solver):
     """Check that the kernel centerer works.
 
@@ -386,7 +388,9 @@ def test_kernel_conditioning():
     assert np.all(kpca.eigenvalues_ == _check_psd_eigenvalues(kpca.eigenvalues_))
 
 
-@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
+@pytest.mark.parametrize(
+    "solver", ["auto", "dense", "arpack", "randomized", "randomized_value"]
+)
 def test_precomputed_kernel_not_psd(solver):
     """Check how KernelPCA works with non-PSD kernels depending on n_components
 
@@ -440,7 +444,8 @@ def test_precomputed_kernel_not_psd(solver):
 
 @pytest.mark.parametrize("n_components", [4, 10, 20])
 def test_kernel_pca_solvers_equivalence(n_components):
-    """Check that 'dense' 'arpack' & 'randomized' solvers give similar results"""
+    """Check that the 'dense', 'arpack', 'randomized' and 'randomized_value'
+    solvers give similar results."""
 
     # Generate random data
     n_train, n_test = 1_000, 100
@@ -474,6 +479,15 @@
         # check that the result is still correct despite the approximation
         assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
 
+    # randomized_value
+    rv_pred = (
+        KernelPCA(n_components, eigen_solver="randomized_value", random_state=0)
+        .fit(X_fit)
+        .transform(X_pred)
+    )
+    # check that the result is still correct despite the approximation
+    assert_array_almost_equal(np.abs(rv_pred), np.abs(ref_pred))
+
 
 def test_kernel_pca_inverse_transform_reconstruction():
     """Test if the reconstruction is a good approximation.
diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py
index 90154470c18a4..127a3d725d51e 100644
--- a/sklearn/manifold/_isomap.py
+++ b/sklearn/manifold/_isomap.py
@@ -57,6 +57,8 @@ class Isomap(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
         'dense' : Use a direct solver (i.e. LAPACK)
             for the eigenvalue decomposition.
 
+        'randomized_value' : Use a randomized eigenvalue solver to reduce
+            computation time.
+
     tol : float, default=0
         Convergence tolerance passed to arpack or lobpcg.
         not used if eigen_solver == 'dense'.
@@ -169,7 +171,7 @@
     "n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
     "radius": [Interval(Real, 0, None, closed="both"), None],
     "n_components": [Interval(Integral, 1, None, closed="left")],
-    "eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
+    "eigen_solver": [StrOptions({"auto", "arpack", "dense", "randomized_value"})],
     "tol": [Interval(Real, 0, None, closed="left")],
     "max_iter": [Interval(Integral, 1, None, closed="left"), None],
     "path_method": [StrOptions({"auto", "FW", "D"})],
diff --git a/sklearn/manifold/tests/test_isomap.py b/sklearn/manifold/tests/test_isomap.py
index e38b92442e58d..ef6581dc973c2 100644
--- a/sklearn/manifold/tests/test_isomap.py
+++ b/sklearn/manifold/tests/test_isomap.py
@@ -15,7 +15,7 @@
 )
 from sklearn.utils.fixes import CSR_CONTAINERS
 
-eigen_solvers = ["auto", "dense", "arpack"]
+eigen_solvers = ["auto", "dense", "arpack", "randomized_value"]
 path_methods = ["auto", "FW", "D"]
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index b98a7747c28aa..e586040d805e0 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -604,6 +604,91 @@ def _randomized_svd(
     return U[:, :n_components], s[:n_components], Vt[:n_components, :]
 
 
+def randomized_eigen_decomposition(
+    A,
+    n_components,
+    n_oversamples=10,
+    n_iter="auto",
+    power_iteration_normalizer="auto",
+    random_state=None,
+):
+    """Approximate eigenvalue decomposition of a Hermitian matrix A ≈ U Λ U*.
+
+    Parameters
+    ----------
+    A : ndarray of shape (n, n)
+        Hermitian matrix to decompose.
+
+    n_components : int
+        Number of eigenvalues and eigenvectors to extract.
+
+    n_oversamples : int, default=10
+        Additional number of random vectors to sample the range of A so as to
+        ensure proper conditioning. The total number of random vectors used to
+        find the range of A is n_components + n_oversamples.
+
+    n_iter : int or "auto", default="auto"
+        Number of power iterations. Can be used to deal with slow singular
+        value decay of A. When "auto", it is set to 4 (or 7 if n_components is
+        small compared to n).
+
+    power_iteration_normalizer : {'auto', 'QR', 'LU', None}, default='auto'
+        Normalizer for power iterations. Used by randomized_range_finder.
+
+    random_state : int, RandomState instance or None, default=None
+        Pseudo-random number generator to control randomness.
+
+    Returns
+    -------
+    Lambda : ndarray of shape (n_components,)
+        Approximate eigenvalues, in ascending order. The n_components largest
+        (algebraic) eigenvalues are returned.
+
+    U : ndarray of shape (n, n_components)
+        The corresponding approximate eigenvectors.
+
+    Notes
+    -----
+    This implementation follows Algorithm 5.3 (Direct Eigenvalue Decomposition)
+    from:
+
+    Halko, N., Martinsson, P.G., & Tropp, J.A. (2011).
+    Finding structure with randomness: Probabilistic algorithms for
+    constructing approximate matrix decompositions.
+    SIAM Review, 53(2), 217–288.
+ """ + random_state = check_random_state(random_state) + n_random = n_components + n_oversamples + n = A.shape[0] + + if n_iter == "auto": + n_iter = 7 if n_components < 0.1 * n else 4 + + # Step 1: compute an orthonormal matrix Q approximating the range of A + Q = randomized_range_finder( + A, + size=n_random, + n_iter=n_iter, + power_iteration_normalizer=power_iteration_normalizer, + random_state=random_state, + ) + + # Step 2: project A to the low-dimensional subspace + B = Q.T @ A @ Q + + # Step 3: compute the eigenvalue decomposition of the small matrix + xp, is_array_api_compliant = get_namespace(B) + if is_array_api_compliant: + Lambda, V = xp.linalg.eigh(B) + else: + Lambda, V = linalg.eigh(B) + + # Step 4: compute the approximate eigenvectors of A + U = Q @ V + + return Lambda[-n_components:], U[:, -n_components:] + + def _randomized_eigsh( M, n_components, @@ -690,9 +775,14 @@ def _randomized_eigsh( effective rank. Usually, `n_components` is chosen to be greater than k so increasing `n_oversamples` up to `n_components` should be enough. - Strategy 'value': not implemented yet. - Algorithms 5.3, 5.4 and 5.5 in the Halko et al paper should provide good - candidates for a future implementation. + Strategy 'value': + This randomized algorithm efficiently approximates eigendecompositions + of Hermitian matrices by projection onto a lower-dimensional subspace using basis Q. + Relying on Algorithm 5.3 from Halko et al.'s article, it computes B = Q*AQ, + finds its eigendecomposition B = VΛV*, and forms U = QV to yield A = UΛU* + with bounded error. Unlike the 'module' strategy, it works efficiently with + non-positive semidefinite matrices, handling both positive and negative + eigenvalues directly. Strategy 'module': The principle is that for diagonalizable matrices, the singular values and @@ -724,8 +814,15 @@ def _randomized_eigsh( Halko, et al. 
(2009) """ if selection == "value": # pragma: no cover - # to do : an algorithm can be found in the Halko et al reference - raise NotImplementedError() + # Call Hako et al 5.3 randomized eigs solver + eigvals, eigvecs = randomized_eigen_decomposition( + M, + n_components=n_components, + n_oversamples=n_oversamples, + n_iter=n_iter, + power_iteration_normalizer=power_iteration_normalizer, + random_state=random_state, + ) elif selection == "module": # Note: no need for deterministic U and Vt (flip_sign=True), diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 907de11702af2..b34a53f22ba9f 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -198,10 +198,6 @@ def test_randomized_eigsh(dtype): # eigenvectors assert eigvecs.shape == (4, 2) - # with 'value' selection method, the negative eigenvalue does not show up - with pytest.raises(NotImplementedError): - _randomized_eigsh(X, n_components=2, selection="value") - @pytest.mark.parametrize("k", (10, 50, 100, 199, 200)) def test_randomized_eigsh_compared_to_others(k): @@ -275,6 +271,44 @@ def test_randomized_eigsh_compared_to_others(k): assert_array_almost_equal(eigvecs_arpack, eigvecs_lapack, decimal=8) +@pytest.mark.parametrize("k", (10, 50, 100, 199, 200)) +def test_randomized_eigsh_value_compared_to_others(k): + """Check that `_randomized_eigsh(value)` is similar to other `eigsh` + + Tests that for a random PSD matrix, `_randomized_eigsh(value)` provides results + comparable to LAPACK (scipy.linalg.eigh) + """ + n_features = 200 + # make a random PSD matrix + X = make_sparse_spd_matrix(n_features, random_state=0) + + # with randomized_value + eigvals, eigvecs = _randomized_eigsh( + X, + n_components=k, + n_oversamples=20, + selection="value", + n_iter=25, + random_state=0, + ) + + # with LAPACK + eigvals_lapack, eigvecs_lapack = eigh( + X, subset_by_index=(n_features - k, n_features - 1) + ) + + # - eigenvalues comparison + assert eigvals.shape == (k,) + # comparison precision + assert_array_almost_equal(eigvals, eigvals_lapack, decimal=6) + # -- eigenvectors comparison + assert eigvecs_lapack.shape == (n_features, k) + dummy_vecs = np.zeros_like(eigvecs).T + eigvecs, _ = svd_flip(eigvecs, dummy_vecs) + eigvecs_lapack, _ = svd_flip(eigvecs_lapack, dummy_vecs) + assert_array_almost_equal(eigvecs, eigvecs_lapack, decimal=4) + + @pytest.mark.parametrize( "n,rank", [ @@ -1119,3 +1153,33 @@ def test_randomized_range_finder_array_api_compliance(array_namespace, device, d assert get_namespace(Q_xp)[0].__name__ == xp.__name__ assert_allclose(_convert_to_numpy(Q_xp, xp), Q_np, atol=atol) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +def test_randomized_eigsh_value_array_api_compliance(array_namespace, device, dtype): + xp = _array_api_for_tests(array_namespace, device) + + rng = np.random.RandomState(0) + X = rng.normal(size=(30, 10)).astype(dtype) + X = X @ X.T + X_xp = xp.asarray(X, device=device) + n_components = 5 + atol = 1e-5 if dtype == "float32" else 0 + + with config_context(array_api_dispatch=True): + l_np, u_np = _randomized_eigsh( + X, n_components=n_components, selection="value", random_state=0 + ) + l_xp, u_xp = _randomized_eigsh( + X_xp, n_components=n_components, selection="value", random_state=0 + ) + + assert get_namespace(u_xp)[0].__name__ == xp.__name__ + assert get_namespace(l_xp)[0].__name__ == xp.__name__ + + 
assert_allclose(_convert_to_numpy(u_xp, xp), u_np, atol=atol) + assert_allclose(_convert_to_numpy(l_xp, xp), l_np, atol=atol)
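For reviewers who want to try this out locally, below is a minimal usage sketch; it is not part of the patch above. It assumes a scikit-learn build that includes this branch, since the eigen_solver="randomized_value" option and the selection="value" strategy are introduced here and are not available in released scikit-learn versions.

    # Minimal sketch: exercising the new "randomized_value" solver end to end.
    # Assumes a scikit-learn build that includes this branch.
    import numpy as np

    from sklearn.datasets import make_circles
    from sklearn.decomposition import KernelPCA
    from sklearn.manifold import Isomap
    from sklearn.utils.extmath import _randomized_eigsh

    X, _ = make_circles(n_samples=500, factor=0.3, noise=0.05, random_state=0)

    # KernelPCA with the new randomized eigenvalue solver.
    kpca = KernelPCA(
        n_components=2,
        kernel="rbf",
        gamma=2.0,
        eigen_solver="randomized_value",
        random_state=0,
    )
    X_kpca = kpca.fit_transform(X)

    # Isomap forwards the same option to its internal kernel PCA step.
    iso = Isomap(n_neighbors=10, n_components=2, eigen_solver="randomized_value")
    X_iso = iso.fit_transform(X)

    # Low-level access: the 'value' selection strategy on a PSD matrix.
    rng = np.random.RandomState(0)
    A = rng.normal(size=(100, 30))
    A = A @ A.T  # random PSD matrix of shape (100, 100)
    eigvals, eigvecs = _randomized_eigsh(
        A, n_components=5, selection="value", random_state=0
    )
    print(X_kpca.shape, X_iso.shape, eigvals.shape)  # (500, 2) (500, 2) (5,)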