-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
TST refactor test_truncated_svd #14140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a55eb87
b7ed247
a912d34
b9e044d
8a2870e
14f5517
9aae4a1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,227 +7,178 @@ | |
|
||
from sklearn.decomposition import TruncatedSVD, PCA | ||
from sklearn.utils import check_random_state | ||
from sklearn.utils.testing import (assert_array_almost_equal, assert_equal, | ||
assert_raises, assert_greater, | ||
assert_array_less, assert_allclose) | ||
from sklearn.utils.testing import assert_array_less, assert_allclose | ||
|
||
SVD_SOLVERS = ['arpack', 'randomized'] | ||
|
||
# Make an X that looks somewhat like a small tf-idf matrix. | ||
# XXX newer versions of SciPy >0.16 have scipy.sparse.rand for this. | ||
shape = 60, 55 | ||
n_samples, n_features = shape | ||
rng = check_random_state(42) | ||
X = rng.randint(-100, 20, np.product(shape)).reshape(shape) | ||
X = sp.csr_matrix(np.maximum(X, 0), dtype=np.float64) | ||
X.data[:] = 1 + np.log(X.data) | ||
Xdense = X.A | ||
|
||
@pytest.fixture(scope='module') | ||
def X_sparse(): | ||
# Make an X that looks somewhat like a small tf-idf matrix. | ||
rng = check_random_state(42) | ||
X = sp.random(60, 55, density=0.2, format="csr", random_state=rng) | ||
X.data[:] = 1 + np.log(X.data) | ||
return X | ||
|
||
def test_algorithms(): | ||
|
||
@pytest.mark.parametrize("solver", ['randomized']) | ||
@pytest.mark.parametrize('kind', ('dense', 'sparse')) | ||
def test_solvers(X_sparse, solver, kind): | ||
X = X_sparse if kind == 'sparse' else X_sparse.toarray() | ||
svd_a = TruncatedSVD(30, algorithm="arpack") | ||
svd_r = TruncatedSVD(30, algorithm="randomized", random_state=42) | ||
svd = TruncatedSVD(30, algorithm=solver, random_state=42) | ||
|
||
Xa = svd_a.fit_transform(X)[:, :6] | ||
Xr = svd_r.fit_transform(X)[:, :6] | ||
assert_array_almost_equal(Xa, Xr, decimal=5) | ||
Xr = svd.fit_transform(X)[:, :6] | ||
assert_allclose(Xa, Xr, rtol=2e-3) | ||
|
||
comp_a = np.abs(svd_a.components_) | ||
comp_r = np.abs(svd_r.components_) | ||
comp = np.abs(svd.components_) | ||
# All elements are equal, but some elements are more equal than others. | ||
assert_array_almost_equal(comp_a[:9], comp_r[:9]) | ||
assert_array_almost_equal(comp_a[9:], comp_r[9:], decimal=2) | ||
assert_allclose(comp_a[:9], comp[:9], rtol=1e-3) | ||
assert_allclose(comp_a[9:], comp[9:], atol=1e-2) | ||
|
||
|
||
def test_attributes(): | ||
for n_components in (10, 25, 41): | ||
tsvd = TruncatedSVD(n_components).fit(X) | ||
assert_equal(tsvd.n_components, n_components) | ||
assert_equal(tsvd.components_.shape, (n_components, n_features)) | ||
@pytest.mark.parametrize("n_components", (10, 25, 41)) | ||
def test_attributes(n_components, X_sparse): | ||
n_features = X_sparse.shape[1] | ||
tsvd = TruncatedSVD(n_components).fit(X_sparse) | ||
assert tsvd.n_components == n_components | ||
assert tsvd.components_.shape == (n_components, n_features) | ||
|
||
|
||
@pytest.mark.parametrize('algorithm', ("arpack", "randomized")) | ||
def test_too_many_components(algorithm): | ||
@pytest.mark.parametrize('algorithm', SVD_SOLVERS) | ||
def test_too_many_components(algorithm, X_sparse): | ||
n_features = X_sparse.shape[1] | ||
for n_components in (n_features, n_features + 1): | ||
rth marked this conversation as resolved.
Show resolved
Hide resolved
|
||
tsvd = TruncatedSVD(n_components=n_components, algorithm=algorithm) | ||
assert_raises(ValueError, tsvd.fit, X) | ||
with pytest.raises(ValueError): | ||
tsvd.fit(X_sparse) | ||
|
||
|
||
@pytest.mark.parametrize('fmt', ("array", "csr", "csc", "coo", "lil")) | ||
def test_sparse_formats(fmt): | ||
Xfmt = Xdense if fmt == "dense" else getattr(X, "to" + fmt)() | ||
def test_sparse_formats(fmt, X_sparse): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's interesting that this is not quite covered by common tests. Apparently we check predict and predict_proba, but not transform. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Opened #14176 to address this. |
||
n_samples = X_sparse.shape[0] | ||
Xfmt = (X_sparse.toarray() | ||
if fmt == "dense" else getattr(X_sparse, "to" + fmt)()) | ||
tsvd = TruncatedSVD(n_components=11) | ||
Xtrans = tsvd.fit_transform(Xfmt) | ||
assert_equal(Xtrans.shape, (n_samples, 11)) | ||
assert Xtrans.shape == (n_samples, 11) | ||
Xtrans = tsvd.transform(Xfmt) | ||
assert_equal(Xtrans.shape, (n_samples, 11)) | ||
assert Xtrans.shape == (n_samples, 11) | ||
|
||
|
||
@pytest.mark.parametrize('algo', ("arpack", "randomized")) | ||
def test_inverse_transform(algo): | ||
@pytest.mark.parametrize('algo', SVD_SOLVERS) | ||
def test_inverse_transform(algo, X_sparse): | ||
# We need a lot of components for the reconstruction to be "almost | ||
# equal" in all positions. XXX Test means or sums instead? | ||
tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo) | ||
Xt = tsvd.fit_transform(X) | ||
Xt = tsvd.fit_transform(X_sparse) | ||
Xinv = tsvd.inverse_transform(Xt) | ||
assert_array_almost_equal(Xinv, Xdense, decimal=1) | ||
assert_allclose(Xinv, X_sparse.toarray(), rtol=1e-1, atol=2e-1) | ||
|
||
|
||
def test_integers(): | ||
Xint = X.astype(np.int64) | ||
def test_integers(X_sparse): | ||
n_samples = X_sparse.shape[0] | ||
Xint = X_sparse.astype(np.int64) | ||
tsvd = TruncatedSVD(n_components=6) | ||
Xtrans = tsvd.fit_transform(Xint) | ||
assert_equal(Xtrans.shape, (n_samples, tsvd.n_components)) | ||
|
||
|
||
def test_explained_variance(): | ||
# Test sparse data | ||
svd_a_10_sp = TruncatedSVD(10, algorithm="arpack") | ||
svd_r_10_sp = TruncatedSVD(10, algorithm="randomized", random_state=42) | ||
svd_a_20_sp = TruncatedSVD(20, algorithm="arpack") | ||
svd_r_20_sp = TruncatedSVD(20, algorithm="randomized", random_state=42) | ||
X_trans_a_10_sp = svd_a_10_sp.fit_transform(X) | ||
X_trans_r_10_sp = svd_r_10_sp.fit_transform(X) | ||
X_trans_a_20_sp = svd_a_20_sp.fit_transform(X) | ||
X_trans_r_20_sp = svd_r_20_sp.fit_transform(X) | ||
|
||
# Test dense data | ||
svd_a_10_de = TruncatedSVD(10, algorithm="arpack") | ||
svd_r_10_de = TruncatedSVD(10, algorithm="randomized", random_state=42) | ||
svd_a_20_de = TruncatedSVD(20, algorithm="arpack") | ||
svd_r_20_de = TruncatedSVD(20, algorithm="randomized", random_state=42) | ||
X_trans_a_10_de = svd_a_10_de.fit_transform(X.toarray()) | ||
X_trans_r_10_de = svd_r_10_de.fit_transform(X.toarray()) | ||
X_trans_a_20_de = svd_a_20_de.fit_transform(X.toarray()) | ||
X_trans_r_20_de = svd_r_20_de.fit_transform(X.toarray()) | ||
|
||
# helper arrays for tests below | ||
svds = (svd_a_10_sp, svd_r_10_sp, svd_a_20_sp, svd_r_20_sp, svd_a_10_de, | ||
svd_r_10_de, svd_a_20_de, svd_r_20_de) | ||
svds_trans = ( | ||
(svd_a_10_sp, X_trans_a_10_sp), | ||
(svd_r_10_sp, X_trans_r_10_sp), | ||
(svd_a_20_sp, X_trans_a_20_sp), | ||
(svd_r_20_sp, X_trans_r_20_sp), | ||
(svd_a_10_de, X_trans_a_10_de), | ||
(svd_r_10_de, X_trans_r_10_de), | ||
(svd_a_20_de, X_trans_a_20_de), | ||
(svd_r_20_de, X_trans_r_20_de), | ||
) | ||
svds_10_v_20 = ( | ||
(svd_a_10_sp, svd_a_20_sp), | ||
(svd_r_10_sp, svd_r_20_sp), | ||
(svd_a_10_de, svd_a_20_de), | ||
(svd_r_10_de, svd_r_20_de), | ||
) | ||
svds_sparse_v_dense = ( | ||
(svd_a_10_sp, svd_a_10_de), | ||
(svd_a_20_sp, svd_a_20_de), | ||
(svd_r_10_sp, svd_r_10_de), | ||
(svd_r_20_sp, svd_r_20_de), | ||
) | ||
assert Xtrans.shape == (n_samples, tsvd.n_components) | ||
|
||
# Assert the 1st component is equal | ||
for svd_10, svd_20 in svds_10_v_20: | ||
assert_array_almost_equal( | ||
svd_10.explained_variance_ratio_, | ||
svd_20.explained_variance_ratio_[:10], | ||
decimal=5, | ||
) | ||
|
||
# Assert that 20 components has higher explained variance than 10 | ||
for svd_10, svd_20 in svds_10_v_20: | ||
assert_greater( | ||
svd_20.explained_variance_ratio_.sum(), | ||
svd_10.explained_variance_ratio_.sum(), | ||
) | ||
|
||
@pytest.mark.parametrize('kind', ('dense', 'sparse')) | ||
@pytest.mark.parametrize('n_components', [10, 20]) | ||
@pytest.mark.parametrize('solver', SVD_SOLVERS) | ||
def test_explained_variance(X_sparse, kind, n_components, solver): | ||
X = X_sparse if kind == 'sparse' else X_sparse.toarray() | ||
svd = TruncatedSVD(n_components, algorithm=solver) | ||
X_tr = svd.fit_transform(X) | ||
# Assert that all the values are greater than 0 | ||
for svd in svds: | ||
assert_array_less(0.0, svd.explained_variance_ratio_) | ||
assert_array_less(0.0, svd.explained_variance_ratio_) | ||
|
||
# Assert that total explained variance is less than 1 | ||
for svd in svds: | ||
assert_array_less(svd.explained_variance_ratio_.sum(), 1.0) | ||
|
||
# Compare sparse vs. dense | ||
for svd_sparse, svd_dense in svds_sparse_v_dense: | ||
assert_array_almost_equal(svd_sparse.explained_variance_ratio_, | ||
svd_dense.explained_variance_ratio_) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was removed as it is indirectly tested with checking that |
||
assert_array_less(svd.explained_variance_ratio_.sum(), 1.0) | ||
|
||
# Test that explained_variance is correct | ||
for svd, transformed in svds_trans: | ||
total_variance = np.var(X.toarray(), axis=0).sum() | ||
variances = np.var(transformed, axis=0) | ||
true_explained_variance_ratio = variances / total_variance | ||
total_variance = np.var(X_sparse.toarray(), axis=0).sum() | ||
variances = np.var(X_tr, axis=0) | ||
true_explained_variance_ratio = variances / total_variance | ||
|
||
assert_array_almost_equal( | ||
svd.explained_variance_ratio_, | ||
true_explained_variance_ratio, | ||
) | ||
assert_allclose( | ||
svd.explained_variance_ratio_, | ||
true_explained_variance_ratio, | ||
) | ||
|
||
|
||
def test_singular_values(): | ||
# Check that the TruncatedSVD output has the correct singular values | ||
@pytest.mark.parametrize('kind', ('dense', 'sparse')) | ||
@pytest.mark.parametrize('solver', SVD_SOLVERS) | ||
def test_explained_variance_components_10_20(X_sparse, kind, solver): | ||
X = X_sparse if kind == 'sparse' else X_sparse.toarray() | ||
svd_10 = TruncatedSVD(10, algorithm=solver).fit(X) | ||
svd_20 = TruncatedSVD(20, algorithm=solver).fit(X) | ||
|
||
rng = np.random.RandomState(0) | ||
n_samples = 100 | ||
n_features = 80 | ||
# Assert the 1st component is equal | ||
assert_allclose( | ||
svd_10.explained_variance_ratio_, | ||
svd_20.explained_variance_ratio_[:10], | ||
rtol=3e-3, | ||
) | ||
|
||
# Assert that 20 components has higher explained variance than 10 | ||
assert ( | ||
svd_20.explained_variance_ratio_.sum() > | ||
svd_10.explained_variance_ratio_.sum() | ||
) | ||
|
||
|
||
@pytest.mark.parametrize('solver', SVD_SOLVERS) | ||
def test_singular_values_consistency(solver): | ||
# Check that the TruncatedSVD output has the correct singular values | ||
rng = np.random.RandomState(0) | ||
rth marked this conversation as resolved.
Show resolved
Hide resolved
|
||
n_samples, n_features = 100, 80 | ||
X = rng.randn(n_samples, n_features) | ||
|
||
apca = TruncatedSVD(n_components=2, algorithm='arpack', | ||
random_state=rng).fit(X) | ||
rpca = TruncatedSVD(n_components=2, algorithm='arpack', | ||
random_state=rng).fit(X) | ||
assert_array_almost_equal(apca.singular_values_, rpca.singular_values_, 12) | ||
pca = TruncatedSVD(n_components=2, algorithm=solver, | ||
random_state=rng).fit(X) | ||
|
||
# Compare to the Frobenius norm | ||
X_apca = apca.transform(X) | ||
X_rpca = rpca.transform(X) | ||
assert_array_almost_equal(np.sum(apca.singular_values_**2.0), | ||
np.linalg.norm(X_apca, "fro")**2.0, 12) | ||
assert_array_almost_equal(np.sum(rpca.singular_values_**2.0), | ||
np.linalg.norm(X_rpca, "fro")**2.0, 12) | ||
X_pca = pca.transform(X) | ||
rth marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assert_allclose(np.sum(pca.singular_values_**2.0), | ||
np.linalg.norm(X_pca, "fro")**2.0, rtol=1e-2) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above |
||
|
||
# Compare to the 2-norms of the score vectors | ||
assert_array_almost_equal(apca.singular_values_, | ||
np.sqrt(np.sum(X_apca**2.0, axis=0)), 12) | ||
assert_array_almost_equal(rpca.singular_values_, | ||
np.sqrt(np.sum(X_rpca**2.0, axis=0)), 12) | ||
assert_allclose(pca.singular_values_, | ||
np.sqrt(np.sum(X_pca**2.0, axis=0)), rtol=1e-2) | ||
|
||
|
||
@pytest.mark.parametrize('solver', SVD_SOLVERS) | ||
def test_singular_values_expected(solver): | ||
# Set the singular values and see what we get back | ||
rng = np.random.RandomState(0) | ||
n_samples = 100 | ||
n_features = 110 | ||
|
||
X = rng.randn(n_samples, n_features) | ||
|
||
apca = TruncatedSVD(n_components=3, algorithm='arpack', | ||
random_state=rng) | ||
rpca = TruncatedSVD(n_components=3, algorithm='randomized', | ||
random_state=rng) | ||
X_apca = apca.fit_transform(X) | ||
X_rpca = rpca.fit_transform(X) | ||
|
||
X_apca /= np.sqrt(np.sum(X_apca**2.0, axis=0)) | ||
X_rpca /= np.sqrt(np.sum(X_rpca**2.0, axis=0)) | ||
X_apca[:, 0] *= 3.142 | ||
X_apca[:, 1] *= 2.718 | ||
X_rpca[:, 0] *= 3.142 | ||
X_rpca[:, 1] *= 2.718 | ||
|
||
X_hat_apca = np.dot(X_apca, apca.components_) | ||
X_hat_rpca = np.dot(X_rpca, rpca.components_) | ||
apca.fit(X_hat_apca) | ||
rpca.fit(X_hat_rpca) | ||
assert_array_almost_equal(apca.singular_values_, [3.142, 2.718, 1.0], 14) | ||
assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 14) | ||
|
||
|
||
def test_truncated_svd_eq_pca(): | ||
pca = TruncatedSVD(n_components=3, algorithm=solver, | ||
random_state=rng) | ||
X_pca = pca.fit_transform(X) | ||
|
||
X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0)) | ||
X_pca[:, 0] *= 3.142 | ||
X_pca[:, 1] *= 2.718 | ||
|
||
X_hat_pca = np.dot(X_pca, pca.components_) | ||
pca.fit(X_hat_pca) | ||
assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0], rtol=1e-14) | ||
|
||
|
||
def test_truncated_svd_eq_pca(X_sparse): | ||
# TruncatedSVD should be equal to PCA on centered data | ||
|
||
X_c = X - X.mean(axis=0) | ||
X_dense = X_sparse.toarray() | ||
|
||
X_c = X_dense - X_dense.mean(axis=0) | ||
|
||
params = dict(n_components=10, random_state=42) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I always wanted to do this more for our tests. (Make data into module level fixtures) Are we okay with this change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see why not.
The only risk is in place modification of the data. Common tests that check read-only arrays as input should prevent it somewhat. Still one way around it could be a module or session scoped fixture that load and prepares the data, and a function scoped fixture that makes a copy of it. Another alternative could be to return data array with a read-only=True flag.
Still that's something we could look into separately. Here that should be fairly equivalent to using a global variable with data.