[MRG] Add sparse efficiency warning to randomized_svd (fixes randomized_svd is slow for dok_matrix and lil_matrix #11262) #11264

Merged
21 changes: 13 additions & 8 deletions sklearn/utils/extmath.py
@@ -15,8 +15,7 @@
 import warnings

 import numpy as np
-from scipy import linalg
-from scipy.sparse import issparse, csr_matrix
+from scipy import linalg, sparse

 from . import check_random_state, deprecated
 from .fixes import np_version
@@ -60,9 +59,9 @@ def row_norms(X, squared=False):

     Performs no input validation.
     """
-    if issparse(X):
-        if not isinstance(X, csr_matrix):
-            X = csr_matrix(X)
+    if sparse.issparse(X):
+        if not isinstance(X, sparse.csr_matrix):
+            X = sparse.csr_matrix(X)
         norms = csr_row_norms(X)
     else:
         norms = np.einsum('ij,ij->i', X, X)
@@ -131,7 +130,7 @@ def safe_sparse_dot(a, b, dense_output=False):
     dot_product : array or sparse matrix
         sparse if ``a`` or ``b`` is sparse and ``dense_output=False``.
     """
-    if issparse(a) or issparse(b):
+    if sparse.issparse(a) or sparse.issparse(b):
         ret = a * b
         if dense_output and hasattr(ret, "toarray"):
             ret = ret.toarray()
@@ -307,6 +306,12 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto',
       analysis
       A. Szlam et al. 2014
     """
+    if isinstance(M, (sparse.lil_matrix, sparse.dok_matrix)):
+        warnings.warn("Calculating SVD of a {} is expensive. "
+                      "csr_matrix is more efficient.".format(
+                          type(M).__name__),
+                      sparse.SparseEfficiencyWarning)
+
     random_state = check_random_state(random_state)
     n_random = n_components + n_oversamples
     n_samples, n_features = M.shape
@@ -620,7 +625,7 @@ def safe_min(X):
     Adapated from http://stackoverflow.com/q/13426580

    """
-    if issparse(X):
+    if sparse.issparse(X):
        if len(X.data) == 0:
            return 0
        m = X.data.min()
@@ -633,7 +638,7 @@ def make_nonnegative(X, min_value=0):
     """Ensure `X.min()` >= `min_value`."""
     min_ = safe_min(X)
     if min_ < min_value:
-        if issparse(X):
+        if sparse.issparse(X):
             raise ValueError("Cannot make the data matrix"
                              " nonnegative because it is sparse."
                              " Adding a value to every entry would"
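For callers, the practical upshot of the new warning is that LIL and DOK matrices should be converted to CSR (or CSC) before the decomposition, since the matrix products inside randomized_svd are much cheaper on compressed formats. A minimal usage sketch, not part of the PR; the matrix shape and component count below are illustrative:

import numpy as np
from scipy import sparse
from sklearn.utils.extmath import randomized_svd

# Hypothetical caller: a matrix assembled incrementally in LIL format.
rng = np.random.RandomState(0)
X_lil = sparse.lil_matrix(rng.rand(100, 50))

# Passing X_lil directly now emits SparseEfficiencyWarning; converting to
# CSR first avoids the warning and the slow sparse matrix products.
U, s, Vt = randomized_svd(X_lil.tocsr(), n_components=5, random_state=0)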
15 changes: 15 additions & 0 deletions sklearn/utils/tests/test_extmath.py
@@ -365,6 +365,21 @@ def test_randomized_svd_power_iteration_normalizer():
     assert_greater(15, np.abs(error_2 - error))


+def test_randomized_svd_sparse_warnings():
Review comment (Member): Can you figure out a way to make the test less time-consuming (e.g., a smaller matrix)? Currently it's among the most time-consuming tests (see the Travis log).
+    # randomized_svd throws a warning for lil and dok matrix
+    rng = np.random.RandomState(42)
+    X = make_low_rank_matrix(50, 20, effective_rank=10, random_state=rng)
+    n_components = 5
+    for cls in (sparse.lil_matrix, sparse.dok_matrix):
+        X = cls(X)
+        assert_warns_message(
+            sparse.SparseEfficiencyWarning,
+            "Calculating SVD of a {} is expensive. "
+            "csr_matrix is more efficient.".format(cls.__name__),
+            randomized_svd, X, n_components, n_iter=1,
+            power_iteration_normalizer='none')
+
+
 def test_svd_flip():
     # Check that svd_flip works in both situations, and reconstructs input.
     rs = np.random.RandomState(1999)
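Regarding the review comment above about test cost: the obvious lever is to shrink the matrix and the requested rank so that each randomized_svd call does less work. A possible variant along those lines is sketched below; the sizes are illustrative (not necessarily the values that were ultimately merged) and the import paths match the 2018-era test utilities:

import numpy as np
from scipy import sparse
from sklearn.datasets import make_low_rank_matrix
from sklearn.utils.extmath import randomized_svd
from sklearn.utils.testing import assert_warns_message


def test_randomized_svd_sparse_warnings_small():
    # Same assertion as the proposed test, but on a much smaller input so
    # the two randomized_svd calls stay cheap.  Sizes are illustrative.
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(20, 15, effective_rank=3, random_state=rng)
    n_components = 3
    for cls in (sparse.lil_matrix, sparse.dok_matrix):
        Y = cls(X)
        assert_warns_message(
            sparse.SparseEfficiencyWarning,
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(cls.__name__),
            randomized_svd, Y, n_components, n_iter=1,
            power_iteration_normalizer='none')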