From af363390d880d110d5722170130213d6a7439f36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Thu, 19 May 2022 13:33:34 +0200
Subject: [PATCH 1/4] Revert change in sklearn.extmath.util and fix
 randomized_svd benchmark

---
 benchmarks/bench_plot_randomized_svd.py | 10 ++++++----
 sklearn/utils/extmath.py                |  8 ++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py
index 081842231039e..36bea0b39d8cd 100644
--- a/benchmarks/bench_plot_randomized_svd.py
+++ b/benchmarks/bench_plot_randomized_svd.py
@@ -107,7 +107,7 @@
 
 # Determine when to switch to batch computation for matrix norms,
 # in case the reconstructed (dense) matrix is too large
-MAX_MEMORY = int(2e9)
+MAX_MEMORY = int(4e9)
 
 # The following datasets can be downloaded manually from:
 # CIFAR 10: https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
@@ -323,8 +323,10 @@ def norm_diff(A, norm=2, msg=True, random_state=None):
 
 
 def scalable_frobenius_norm_discrepancy(X, U, s, V):
-    # if the input is not too big, just call scipy
-    if X.shape[0] * X.shape[1] < MAX_MEMORY:
+    # if the input is not sparse or sparse but not too big, call scipy
+    if not sp.sparse.issparse(X) or (
+        X.shape[0] * X.shape[1] * X.dtype.itemsize < MAX_MEMORY
+    ):
         A = X - U.dot(np.diag(s).dot(V))
         return norm_diff(A, norm="fro")
 
@@ -498,7 +500,7 @@ def bench_c(datasets, n_comps):
 
 if __name__ == "__main__":
     random_state = check_random_state(1234)
-    power_iter = np.linspace(0, 6, 7, dtype=int)
+    power_iter = np.arange(0, 6)
     n_comps = 50
 
     for dataset_name in datasets:
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index 35e392f6e4540..8a7be5fd245f0 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -217,6 +217,10 @@ def randomized_range_finder(
     # Generating normal random vectors with shape: (A.shape[1], size)
     Q = random_state.normal(size=(A.shape[1], size))
 
+    if hasattr(A, "dtype") and A.dtype.kind == "f":
+        # Ensure f32 is preserved as f32
+        Q = Q.astype(A.dtype, copy=False)
+
     # Deal with "auto" mode
     if power_iteration_normalizer == "auto":
         if n_iter <= 2:
@@ -241,10 +245,6 @@
         # Extract an orthonormal basis
         Q, _ = linalg.qr(safe_sparse_dot(A, Q), mode="economic")
 
-    if hasattr(A, "dtype") and A.dtype.kind == "f":
-        # Ensure f32 is preserved as f32
-        Q = Q.astype(A.dtype, copy=False)
-
     return Q

From b016b734dbee2a2da8e165cda1ec18667b18385c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Thu, 19 May 2022 15:23:57 +0200
Subject: [PATCH 2/4] Put the code back as it was

---
 sklearn/utils/extmath.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index 8a7be5fd245f0..e4513a62bf07e 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -216,7 +216,6 @@ def randomized_range_finder(
 
     # Generating normal random vectors with shape: (A.shape[1], size)
     Q = random_state.normal(size=(A.shape[1], size))
-
     if hasattr(A, "dtype") and A.dtype.kind == "f":
         # Ensure f32 is preserved as f32
         Q = Q.astype(A.dtype, copy=False)

From cbf4906aab2be44a9a2cff3eadabb69ae84cbb90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Fri, 20 May 2022 08:19:28 +0200
Subject: [PATCH 3/4] Tweak

---
 benchmarks/bench_plot_randomized_svd.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py
index 36bea0b39d8cd..49b93cd137389 100644
--- a/benchmarks/bench_plot_randomized_svd.py
+++ b/benchmarks/bench_plot_randomized_svd.py
@@ -323,10 +323,9 @@ def norm_diff(A, norm=2, msg=True, random_state=None):
 
 
 def scalable_frobenius_norm_discrepancy(X, U, s, V):
-    # if the input is not sparse or sparse but not too big, call scipy
-    if not sp.sparse.issparse(X) or (
-        X.shape[0] * X.shape[1] * X.dtype.itemsize < MAX_MEMORY
-    ):
+    if not sp.sparse.issparse(X) or (X.size * X.dtype.itemsize < MAX_MEMORY):
+        # if the input is not sparse or sparse but not too big,
+        # U.dot(np.diag(s).dot(V)) will fit in RAM
         A = X - U.dot(np.diag(s).dot(V))
         return norm_diff(A, norm="fro")
 

From 3e2aec8d11ebb28b188cdbe9937a7325b42d50a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Fri, 20 May 2022 15:55:40 +0200
Subject: [PATCH 4/4] Fix

---
 benchmarks/bench_plot_randomized_svd.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py
index 49b93cd137389..ecc1bbb92ce61 100644
--- a/benchmarks/bench_plot_randomized_svd.py
+++ b/benchmarks/bench_plot_randomized_svd.py
@@ -323,7 +323,9 @@ def norm_diff(A, norm=2, msg=True, random_state=None):
 
 
 def scalable_frobenius_norm_discrepancy(X, U, s, V):
-    if not sp.sparse.issparse(X) or (X.size * X.dtype.itemsize < MAX_MEMORY):
+    if not sp.sparse.issparse(X) or (
+        X.shape[0] * X.shape[1] * X.dtype.itemsize < MAX_MEMORY
+    ):
         # if the input is not sparse or sparse but not too big,
         # U.dot(np.diag(s).dot(V)) will fit in RAM
         A = X - U.dot(np.diag(s).dot(V))