TST refactor test_truncated_svd #14140

Merged: 7 commits, Jun 24, 2019
sklearn/decomposition/tests/test_truncated_svd.py (273 changes: 112 additions & 161 deletions)

@@ -7,227 +7,178 @@

 from sklearn.decomposition import TruncatedSVD, PCA
 from sklearn.utils import check_random_state
-from sklearn.utils.testing import (assert_array_almost_equal, assert_equal,
-                                   assert_raises, assert_greater,
-                                   assert_array_less, assert_allclose)
+from sklearn.utils.testing import assert_array_less, assert_allclose
 
+SVD_SOLVERS = ['arpack', 'randomized']
 
-# Make an X that looks somewhat like a small tf-idf matrix.
-# XXX newer versions of SciPy >0.16 have scipy.sparse.rand for this.
-shape = 60, 55
-n_samples, n_features = shape
-rng = check_random_state(42)
-X = rng.randint(-100, 20, np.product(shape)).reshape(shape)
-X = sp.csr_matrix(np.maximum(X, 0), dtype=np.float64)
-X.data[:] = 1 + np.log(X.data)
-Xdense = X.A
 
+@pytest.fixture(scope='module')
+def X_sparse():
Member: I always wanted to do this more for our tests (turning module-level data into fixtures). Are we okay with this change?

Member Author: I don't see why not.

The only risk is in-place modification of the data. Common tests that check estimators on read-only input arrays should prevent it somewhat. Still, one way around it would be a module- or session-scoped fixture that loads and prepares the data, plus a function-scoped fixture that hands each test its own copy. Another alternative would be to return the data array with its writeable flag set to False; a sketch of both options appears below.

Still, that is something we could look into separately. Here this should be fairly equivalent to using a global variable with the data.
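A minimal sketch of those two options, assuming pytest and this module's data setup; the fixture names _X_shared and X_copy are made up for illustration:

    import numpy as np
    import pytest
    import scipy.sparse as sp
    from sklearn.utils import check_random_state

    @pytest.fixture(scope='module')
    def _X_shared():
        # Built once per module, mirroring the X_sparse fixture below.
        rng = check_random_state(42)
        X = sp.random(60, 55, density=0.2, format="csr", random_state=rng)
        X.data[:] = 1 + np.log(X.data)
        # Option 2: reject in-place writes, so accidental mutation raises
        # instead of silently leaking state between tests.
        X.data.flags.writeable = False
        return X

    @pytest.fixture
    def X_copy(_X_shared):
        # Option 1: each test receives its own mutable copy.
        return _X_shared.copy()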

+    # Make an X that looks somewhat like a small tf-idf matrix.
+    rng = check_random_state(42)
+    X = sp.random(60, 55, density=0.2, format="csr", random_state=rng)
+    X.data[:] = 1 + np.log(X.data)
+    return X
 
-def test_algorithms():
+
+@pytest.mark.parametrize("solver", ['randomized'])
+@pytest.mark.parametrize('kind', ('dense', 'sparse'))
+def test_solvers(X_sparse, solver, kind):
+    X = X_sparse if kind == 'sparse' else X_sparse.toarray()
     svd_a = TruncatedSVD(30, algorithm="arpack")
-    svd_r = TruncatedSVD(30, algorithm="randomized", random_state=42)
+    svd = TruncatedSVD(30, algorithm=solver, random_state=42)
 
     Xa = svd_a.fit_transform(X)[:, :6]
-    Xr = svd_r.fit_transform(X)[:, :6]
-    assert_array_almost_equal(Xa, Xr, decimal=5)
+    Xr = svd.fit_transform(X)[:, :6]
+    assert_allclose(Xa, Xr, rtol=2e-3)
 
     comp_a = np.abs(svd_a.components_)
-    comp_r = np.abs(svd_r.components_)
+    comp = np.abs(svd.components_)
     # All elements are equal, but some elements are more equal than others.
-    assert_array_almost_equal(comp_a[:9], comp_r[:9])
-    assert_array_almost_equal(comp_a[9:], comp_r[9:], decimal=2)
+    assert_allclose(comp_a[:9], comp[:9], rtol=1e-3)
+    assert_allclose(comp_a[9:], comp[9:], atol=1e-2)
 
 
-def test_attributes():
-    for n_components in (10, 25, 41):
-        tsvd = TruncatedSVD(n_components).fit(X)
-        assert_equal(tsvd.n_components, n_components)
-        assert_equal(tsvd.components_.shape, (n_components, n_features))
+@pytest.mark.parametrize("n_components", (10, 25, 41))
+def test_attributes(n_components, X_sparse):
+    n_features = X_sparse.shape[1]
+    tsvd = TruncatedSVD(n_components).fit(X_sparse)
+    assert tsvd.n_components == n_components
+    assert tsvd.components_.shape == (n_components, n_features)
 
 
-@pytest.mark.parametrize('algorithm', ("arpack", "randomized"))
-def test_too_many_components(algorithm):
+@pytest.mark.parametrize('algorithm', SVD_SOLVERS)
+def test_too_many_components(algorithm, X_sparse):
+    n_features = X_sparse.shape[1]
     for n_components in (n_features, n_features + 1):
         tsvd = TruncatedSVD(n_components=n_components, algorithm=algorithm)
-        assert_raises(ValueError, tsvd.fit, X)
+        with pytest.raises(ValueError):
+            tsvd.fit(X_sparse)
 
 
 @pytest.mark.parametrize('fmt', ("array", "csr", "csc", "coo", "lil"))
-def test_sparse_formats(fmt):
-    Xfmt = Xdense if fmt == "dense" else getattr(X, "to" + fmt)()
+def test_sparse_formats(fmt, X_sparse):
Member: It's interesting that this is not quite covered by the common tests. Apparently we check predict and predict_proba on the different sparse formats, but not transform.

Member Author: Opened #14176 to address this. A sketch of what such a check could look like is below.
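For illustration only, a hypothetical common-test-style check; the helper name and data are made up here, and this is not the code of #14176:

    import numpy as np
    import scipy.sparse as sp
    from sklearn.decomposition import TruncatedSVD

    def check_transform_accepts_sparse_formats(estimator):
        # Fit on CSR, then verify that transform accepts the other
        # sparse formats and returns the same values.
        rng = np.random.RandomState(0)
        X = sp.random(40, 30, density=0.3, format="csr", random_state=rng)
        est = estimator.fit(X)
        Xt_ref = est.transform(X)
        for fmt in ("csc", "coo", "lil", "dok"):
            Xt = est.transform(getattr(X, "to" + fmt)())
            np.testing.assert_allclose(Xt, Xt_ref, rtol=1e-7, atol=1e-12)

    check_transform_accepts_sparse_formats(
        TruncatedSVD(n_components=5, random_state=0))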

+    n_samples = X_sparse.shape[0]
+    Xfmt = (X_sparse.toarray()
+            if fmt == "dense" else getattr(X_sparse, "to" + fmt)())
     tsvd = TruncatedSVD(n_components=11)
     Xtrans = tsvd.fit_transform(Xfmt)
-    assert_equal(Xtrans.shape, (n_samples, 11))
+    assert Xtrans.shape == (n_samples, 11)
     Xtrans = tsvd.transform(Xfmt)
-    assert_equal(Xtrans.shape, (n_samples, 11))
+    assert Xtrans.shape == (n_samples, 11)


-@pytest.mark.parametrize('algo', ("arpack", "randomized"))
-def test_inverse_transform(algo):
+@pytest.mark.parametrize('algo', SVD_SOLVERS)
+def test_inverse_transform(algo, X_sparse):
     # We need a lot of components for the reconstruction to be "almost
     # equal" in all positions. XXX Test means or sums instead?
     tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo)
-    Xt = tsvd.fit_transform(X)
+    Xt = tsvd.fit_transform(X_sparse)
     Xinv = tsvd.inverse_transform(Xt)
-    assert_array_almost_equal(Xinv, Xdense, decimal=1)
+    assert_allclose(Xinv, X_sparse.toarray(), rtol=1e-1, atol=2e-1)
 
 
-def test_integers():
-    Xint = X.astype(np.int64)
+def test_integers(X_sparse):
+    n_samples = X_sparse.shape[0]
+    Xint = X_sparse.astype(np.int64)
     tsvd = TruncatedSVD(n_components=6)
     Xtrans = tsvd.fit_transform(Xint)
-    assert_equal(Xtrans.shape, (n_samples, tsvd.n_components))
+    assert Xtrans.shape == (n_samples, tsvd.n_components)
 
 
-def test_explained_variance():
-    # Test sparse data
-    svd_a_10_sp = TruncatedSVD(10, algorithm="arpack")
-    svd_r_10_sp = TruncatedSVD(10, algorithm="randomized", random_state=42)
-    svd_a_20_sp = TruncatedSVD(20, algorithm="arpack")
-    svd_r_20_sp = TruncatedSVD(20, algorithm="randomized", random_state=42)
-    X_trans_a_10_sp = svd_a_10_sp.fit_transform(X)
-    X_trans_r_10_sp = svd_r_10_sp.fit_transform(X)
-    X_trans_a_20_sp = svd_a_20_sp.fit_transform(X)
-    X_trans_r_20_sp = svd_r_20_sp.fit_transform(X)
-
-    # Test dense data
-    svd_a_10_de = TruncatedSVD(10, algorithm="arpack")
-    svd_r_10_de = TruncatedSVD(10, algorithm="randomized", random_state=42)
-    svd_a_20_de = TruncatedSVD(20, algorithm="arpack")
-    svd_r_20_de = TruncatedSVD(20, algorithm="randomized", random_state=42)
-    X_trans_a_10_de = svd_a_10_de.fit_transform(X.toarray())
-    X_trans_r_10_de = svd_r_10_de.fit_transform(X.toarray())
-    X_trans_a_20_de = svd_a_20_de.fit_transform(X.toarray())
-    X_trans_r_20_de = svd_r_20_de.fit_transform(X.toarray())
-
-    # helper arrays for tests below
-    svds = (svd_a_10_sp, svd_r_10_sp, svd_a_20_sp, svd_r_20_sp, svd_a_10_de,
-            svd_r_10_de, svd_a_20_de, svd_r_20_de)
-    svds_trans = (
-        (svd_a_10_sp, X_trans_a_10_sp),
-        (svd_r_10_sp, X_trans_r_10_sp),
-        (svd_a_20_sp, X_trans_a_20_sp),
-        (svd_r_20_sp, X_trans_r_20_sp),
-        (svd_a_10_de, X_trans_a_10_de),
-        (svd_r_10_de, X_trans_r_10_de),
-        (svd_a_20_de, X_trans_a_20_de),
-        (svd_r_20_de, X_trans_r_20_de),
-    )
-    svds_10_v_20 = (
-        (svd_a_10_sp, svd_a_20_sp),
-        (svd_r_10_sp, svd_r_20_sp),
-        (svd_a_10_de, svd_a_20_de),
-        (svd_r_10_de, svd_r_20_de),
-    )
-    svds_sparse_v_dense = (
-        (svd_a_10_sp, svd_a_10_de),
-        (svd_a_20_sp, svd_a_20_de),
-        (svd_r_10_sp, svd_r_10_de),
-        (svd_r_20_sp, svd_r_20_de),
-    )
-
-    # Assert the 1st component is equal
-    for svd_10, svd_20 in svds_10_v_20:
-        assert_array_almost_equal(
-            svd_10.explained_variance_ratio_,
-            svd_20.explained_variance_ratio_[:10],
-            decimal=5,
-        )
-
-    # Assert that 20 components has higher explained variance than 10
-    for svd_10, svd_20 in svds_10_v_20:
-        assert_greater(
-            svd_20.explained_variance_ratio_.sum(),
-            svd_10.explained_variance_ratio_.sum(),
-        )
-
+@pytest.mark.parametrize('kind', ('dense', 'sparse'))
+@pytest.mark.parametrize('n_components', [10, 20])
+@pytest.mark.parametrize('solver', SVD_SOLVERS)
+def test_explained_variance(X_sparse, kind, n_components, solver):
+    X = X_sparse if kind == 'sparse' else X_sparse.toarray()
+    svd = TruncatedSVD(n_components, algorithm=solver)
+    X_tr = svd.fit_transform(X)
     # Assert that all the values are greater than 0
-    for svd in svds:
-        assert_array_less(0.0, svd.explained_variance_ratio_)
+    assert_array_less(0.0, svd.explained_variance_ratio_)
 
     # Assert that total explained variance is less than 1
-    for svd in svds:
-        assert_array_less(svd.explained_variance_ratio_.sum(), 1.0)
-
-    # Compare sparse vs. dense
-    for svd_sparse, svd_dense in svds_sparse_v_dense:
-        assert_array_almost_equal(svd_sparse.explained_variance_ratio_,
-                                  svd_dense.explained_variance_ratio_)
Member Author: This was removed because it is indirectly covered below: explained_variance_ratio_ is checked against the expected value for both dense and sparse input, so the sparse and dense results must also agree with each other.

+    assert_array_less(svd.explained_variance_ratio_.sum(), 1.0)
 
     # Test that explained_variance is correct
-    for svd, transformed in svds_trans:
-        total_variance = np.var(X.toarray(), axis=0).sum()
-        variances = np.var(transformed, axis=0)
-        true_explained_variance_ratio = variances / total_variance
+    total_variance = np.var(X_sparse.toarray(), axis=0).sum()
+    variances = np.var(X_tr, axis=0)
+    true_explained_variance_ratio = variances / total_variance
 
-        assert_array_almost_equal(
-            svd.explained_variance_ratio_,
-            true_explained_variance_ratio,
-        )
+    assert_allclose(
+        svd.explained_variance_ratio_,
+        true_explained_variance_ratio,
+    )


-def test_singular_values():
-    # Check that the TruncatedSVD output has the correct singular values
+@pytest.mark.parametrize('kind', ('dense', 'sparse'))
+@pytest.mark.parametrize('solver', SVD_SOLVERS)
+def test_explained_variance_components_10_20(X_sparse, kind, solver):
+    X = X_sparse if kind == 'sparse' else X_sparse.toarray()
+    svd_10 = TruncatedSVD(10, algorithm=solver).fit(X)
+    svd_20 = TruncatedSVD(20, algorithm=solver).fit(X)
 
-    rng = np.random.RandomState(0)
-    n_samples = 100
-    n_features = 80
+    # Assert the 1st component is equal
+    assert_allclose(
+        svd_10.explained_variance_ratio_,
+        svd_20.explained_variance_ratio_[:10],
+        rtol=3e-3,
+    )
+
+    # Assert that 20 components has higher explained variance than 10
+    assert (
+        svd_20.explained_variance_ratio_.sum() >
+        svd_10.explained_variance_ratio_.sum()
+    )
+
+
+@pytest.mark.parametrize('solver', SVD_SOLVERS)
+def test_singular_values_consistency(solver):
+    # Check that the TruncatedSVD output has the correct singular values
+    rng = np.random.RandomState(0)
+    n_samples, n_features = 100, 80
     X = rng.randn(n_samples, n_features)
 
-    apca = TruncatedSVD(n_components=2, algorithm='arpack',
-                        random_state=rng).fit(X)
-    rpca = TruncatedSVD(n_components=2, algorithm='arpack',
-                        random_state=rng).fit(X)
-    assert_array_almost_equal(apca.singular_values_, rpca.singular_values_, 12)
+    pca = TruncatedSVD(n_components=2, algorithm=solver,
+                       random_state=rng).fit(X)
 
     # Compare to the Frobenius norm
-    X_apca = apca.transform(X)
-    X_rpca = rpca.transform(X)
-    assert_array_almost_equal(np.sum(apca.singular_values_**2.0),
-                              np.linalg.norm(X_apca, "fro")**2.0, 12)
-    assert_array_almost_equal(np.sum(rpca.singular_values_**2.0),
-                              np.linalg.norm(X_rpca, "fro")**2.0, 12)
+    X_pca = pca.transform(X)
+    assert_allclose(np.sum(pca.singular_values_**2.0),
+                    np.linalg.norm(X_pca, "fro")**2.0, rtol=1e-2)
Member Author: Same as above.


     # Compare to the 2-norms of the score vectors
-    assert_array_almost_equal(apca.singular_values_,
-                              np.sqrt(np.sum(X_apca**2.0, axis=0)), 12)
-    assert_array_almost_equal(rpca.singular_values_,
-                              np.sqrt(np.sum(X_rpca**2.0, axis=0)), 12)
+    assert_allclose(pca.singular_values_,
+                    np.sqrt(np.sum(X_pca**2.0, axis=0)), rtol=1e-2)
 
 
+@pytest.mark.parametrize('solver', SVD_SOLVERS)
+def test_singular_values_expected(solver):
     # Set the singular values and see what we get back
     rng = np.random.RandomState(0)
     n_samples = 100
     n_features = 110
 
     X = rng.randn(n_samples, n_features)
 
-    apca = TruncatedSVD(n_components=3, algorithm='arpack',
-                        random_state=rng)
-    rpca = TruncatedSVD(n_components=3, algorithm='randomized',
-                        random_state=rng)
-    X_apca = apca.fit_transform(X)
-    X_rpca = rpca.fit_transform(X)
-
-    X_apca /= np.sqrt(np.sum(X_apca**2.0, axis=0))
-    X_rpca /= np.sqrt(np.sum(X_rpca**2.0, axis=0))
-    X_apca[:, 0] *= 3.142
-    X_apca[:, 1] *= 2.718
-    X_rpca[:, 0] *= 3.142
-    X_rpca[:, 1] *= 2.718
-
-    X_hat_apca = np.dot(X_apca, apca.components_)
-    X_hat_rpca = np.dot(X_rpca, rpca.components_)
-    apca.fit(X_hat_apca)
-    rpca.fit(X_hat_rpca)
-    assert_array_almost_equal(apca.singular_values_, [3.142, 2.718, 1.0], 14)
-    assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 14)
-
-
-def test_truncated_svd_eq_pca():
+    pca = TruncatedSVD(n_components=3, algorithm=solver,
+                       random_state=rng)
+    X_pca = pca.fit_transform(X)
+
+    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
+    X_pca[:, 0] *= 3.142
+    X_pca[:, 1] *= 2.718
+
+    X_hat_pca = np.dot(X_pca, pca.components_)
+    pca.fit(X_hat_pca)
+    assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0], rtol=1e-14)
 
 
+def test_truncated_svd_eq_pca(X_sparse):
     # TruncatedSVD should be equal to PCA on centered data
 
-    X_c = X - X.mean(axis=0)
+    X_dense = X_sparse.toarray()
+
+    X_c = X_dense - X_dense.mean(axis=0)
 
     params = dict(n_components=10, random_state=42)

Expand Down
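The remainder of this test is collapsed above. For context, the equivalence it asserts can be sketched as a standalone check; the random dense data, the 'full' PCA solver, the 'arpack' algorithm, and the tolerance below are assumptions for illustration, not the PR's collapsed code:

    import numpy as np
    from sklearn.decomposition import PCA, TruncatedSVD

    rng = np.random.RandomState(42)
    X = rng.randn(60, 55)
    X_c = X - X.mean(axis=0)  # center, as PCA does internally

    pca = PCA(n_components=10, svd_solver='full').fit(X_c)
    tsvd = TruncatedSVD(n_components=10, algorithm='arpack',
                        random_state=42).fit(X_c)

    # On centered data, TruncatedSVD recovers the same components as PCA;
    # individual components may differ by sign, hence the abs().
    np.testing.assert_allclose(np.abs(pca.components_),
                               np.abs(tsvd.components_), atol=1e-9)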