-
-
Notifications
You must be signed in to change notification settings - Fork 26.2k
[MRG] Implement randomized PCA #12841
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
633e3ca
fe3d3a8
2b6c217
46c2540
99c1b1d
3b3a483
33574d1
f94c6f0
de5726f
d18a339
473b940
16a3c8e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -14,6 +14,7 @@ | |||||
from sklearn.utils.testing import assert_no_warnings | ||||||
from sklearn.utils.testing import ignore_warnings | ||||||
from sklearn.utils.testing import assert_less | ||||||
from sklearn.utils.testing import assert_allclose | ||||||
|
||||||
from sklearn import datasets | ||||||
from sklearn.decomposition import PCA | ||||||
|
@@ -256,34 +257,33 @@ def test_singular_values(): | |||||
|
||||||
X = rng.randn(n_samples, n_features) | ||||||
|
||||||
pca = PCA(n_components=2, svd_solver='full', | ||||||
random_state=rng).fit(X) | ||||||
apca = PCA(n_components=2, svd_solver='arpack', | ||||||
pca = PCA(n_components=2, svd_solver='full', random_state=rng).fit(X) | ||||||
apca = PCA(n_components=2, svd_solver='arpack', random_state=rng).fit(X) | ||||||
# Increase the number of power iterations to get greater accuracy in tests | ||||||
rpca = PCA(n_components=2, svd_solver='randomized', iterated_power=40, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This whole If you have to set Also in general, please avoid style fixes that are not related to the issue (like spaces between |
||||||
random_state=rng).fit(X) | ||||||
rpca = PCA(n_components=2, svd_solver='randomized', | ||||||
random_state=rng).fit(X) | ||||||
assert_array_almost_equal(pca.singular_values_, apca.singular_values_, 12) | ||||||
assert_array_almost_equal(pca.singular_values_, rpca.singular_values_, 1) | ||||||
assert_array_almost_equal(apca.singular_values_, rpca.singular_values_, 1) | ||||||
assert_allclose(pca.singular_values_, apca.singular_values_) | ||||||
assert_allclose(pca.singular_values_, rpca.singular_values_) | ||||||
assert_allclose(apca.singular_values_, rpca.singular_values_) | ||||||
|
||||||
# Compare to the Frobenius norm | ||||||
X_pca = pca.transform(X) | ||||||
X_apca = apca.transform(X) | ||||||
X_rpca = rpca.transform(X) | ||||||
assert_array_almost_equal(np.sum(pca.singular_values_**2.0), | ||||||
np.linalg.norm(X_pca, "fro")**2.0, 12) | ||||||
assert_array_almost_equal(np.sum(apca.singular_values_**2.0), | ||||||
np.linalg.norm(X_apca, "fro")**2.0, 9) | ||||||
assert_array_almost_equal(np.sum(rpca.singular_values_**2.0), | ||||||
np.linalg.norm(X_rpca, "fro")**2.0, 0) | ||||||
assert_array_almost_equal(np.sum(pca.singular_values_ ** 2.0), | ||||||
np.linalg.norm(X_pca, "fro") ** 2.0, 12) | ||||||
assert_array_almost_equal(np.sum(apca.singular_values_ ** 2.0), | ||||||
np.linalg.norm(X_apca, "fro") ** 2.0, 9) | ||||||
assert_array_almost_equal(np.sum(rpca.singular_values_ ** 2.0), | ||||||
np.linalg.norm(X_rpca, "fro") ** 2.0, 0) | ||||||
|
||||||
# Compare to the 2-norms of the score vectors | ||||||
assert_array_almost_equal(pca.singular_values_, | ||||||
np.sqrt(np.sum(X_pca**2.0, axis=0)), 12) | ||||||
assert_array_almost_equal(apca.singular_values_, | ||||||
np.sqrt(np.sum(X_apca**2.0, axis=0)), 12) | ||||||
assert_array_almost_equal(rpca.singular_values_, | ||||||
np.sqrt(np.sum(X_rpca**2.0, axis=0)), 2) | ||||||
assert_allclose(pca.singular_values_, | ||||||
np.sqrt(np.sum(X_pca ** 2.0, axis=0))) | ||||||
assert_allclose(apca.singular_values_, | ||||||
np.sqrt(np.sum(X_apca ** 2.0, axis=0))) | ||||||
assert_allclose(rpca.singular_values_, | ||||||
np.sqrt(np.sum(X_rpca ** 2.0, axis=0))) | ||||||
|
||||||
# Set the singular values and see what we get back | ||||||
rng = np.random.RandomState(0) | ||||||
|
@@ -297,17 +297,18 @@ def test_singular_values(): | |||||
rpca = PCA(n_components=3, svd_solver='randomized', random_state=rng) | ||||||
X_pca = pca.fit_transform(X) | ||||||
|
||||||
X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0)) | ||||||
X_pca /= np.sqrt(np.sum(X_pca ** 2.0, axis=0)) | ||||||
X_pca[:, 0] *= 3.142 | ||||||
X_pca[:, 1] *= 2.718 | ||||||
|
||||||
X_hat = np.dot(X_pca, pca.components_) | ||||||
pca.fit(X_hat) | ||||||
apca.fit(X_hat) | ||||||
rpca.fit(X_hat) | ||||||
assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14) | ||||||
assert_array_almost_equal(apca.singular_values_, [3.142, 2.718, 1.0], 14) | ||||||
assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 14) | ||||||
|
||||||
assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0]) | ||||||
assert_allclose(apca.singular_values_, [3.142, 2.718, 1.0]) | ||||||
assert_allclose(rpca.singular_values_, [3.142, 2.718, 1.0]) | ||||||
|
||||||
|
||||||
def test_pca_check_projection(): | ||||||
|
@@ -683,15 +684,50 @@ def test_svd_solver_auto(): | |||||
assert_array_almost_equal(pca.components_, pca_test.components_) | ||||||
|
||||||
|
||||||
@pytest.mark.parametrize('svd_solver', solver_list) | ||||||
def test_pca_sparse_input(svd_solver): | ||||||
def test_pca_sparse_input_randomized_solver(): | ||||||
rng = np.random.RandomState(0) | ||||||
n_samples = 100 | ||||||
n_features = 80 | ||||||
|
||||||
X = rng.binomial(1, 0.1, (n_samples, n_features)) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nitpick: use |
||||||
X_sp = sp.sparse.csr_matrix(X) | ||||||
|
||||||
# Compute the randomized decomposition on the dense matrix | ||||||
pca = PCA(n_components=3, svd_solver='randomized', | ||||||
pavlin-policar marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
random_state=0).fit(X) | ||||||
# And compute the randomized decomposition on the sparse matrix. | ||||||
pca_sp = PCA(n_components=3, svd_solver='randomized', | ||||||
pavlin-policar marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
random_state=0).fit(X_sp) | ||||||
|
||||||
# Ensure the singular values are close to the exact singular values | ||||||
assert_allclose(pca_sp.singular_values_, pca.singular_values_) | ||||||
|
||||||
# Ensure that the basis is close to the true basis | ||||||
X_pca = pca.transform(X) | ||||||
X_sppca = pca_sp.transform(X) | ||||||
assert_allclose(X_sppca, X_pca) | ||||||
|
||||||
|
||||||
@pytest.mark.parametrize('svd_solver', ['full', 'arpack']) | ||||||
def test_pca_sparse_input_bad_solvers(svd_solver): | ||||||
X = np.random.RandomState(0).rand(5, 4) | ||||||
X = sp.sparse.csr_matrix(X) | ||||||
assert(sp.sparse.issparse(X)) | ||||||
|
||||||
pca = PCA(n_components=3, svd_solver=svd_solver) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
||||||
assert_raises(TypeError, pca.fit, X) | ||||||
with pytest.raises(ValueError, match='Only the randomized solver supports ' | ||||||
'sparse matrices'): | ||||||
pca.fit(X) | ||||||
|
||||||
pavlin-policar marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
def test_pca_auto_solver_selects_randomized_solver_for_sparse_matrices(): | ||||||
X = np.random.RandomState(0).rand(5, 4) | ||||||
X = sp.sparse.csr_matrix(X) | ||||||
|
||||||
pca = PCA(n_components=3, svd_solver='auto') | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
pca.fit(X) | ||||||
|
||||||
assert pca._fit_svd_solver == 'randomized' | ||||||
|
||||||
|
||||||
def test_pca_bad_solver(): | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should play it safe here and only call
randomized_pca
whenX
is sparse. Who knows, there might be bugs inrandomized_pca
, andrandomized_svd
is probably faster for dense matrices.