Skip to content

[MRG] Add support for 64 bit indices in CSR array normalization #9663

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 5, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 17 additions & 16 deletions sklearn/utils/sparsefuncs_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ from cython cimport floating

np.import_array()

# Fused integer type for the CSR index arrays (X_indices / X_indptr):
# lets the routines below be specialized for either C `int` or
# `long long` index buffers instead of being hard-wired to `int`.
ctypedef fused integral:
int
long long

ctypedef np.float64_t DOUBLE

Expand All @@ -30,11 +33,11 @@ def csr_row_norms(X):

def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data,
shape,
np.ndarray[int, ndim=1, mode="c"] X_indices,
np.ndarray[int, ndim=1, mode="c"] X_indptr):
np.ndarray[integral, ndim=1, mode="c"] X_indices,
np.ndarray[integral, ndim=1, mode="c"] X_indptr):
cdef:
unsigned int n_samples = shape[0]
unsigned int n_features = shape[1]
unsigned long long n_samples = shape[0]
unsigned long long n_features = shape[1]
np.ndarray[DOUBLE, ndim=1, mode="c"] norms

np.npy_intp i, j
Expand Down Expand Up @@ -326,17 +329,16 @@ def inplace_csr_row_normalize_l1(X):

def _inplace_csr_row_normalize_l1(np.ndarray[floating, ndim=1] X_data,
shape,
np.ndarray[int, ndim=1] X_indices,
np.ndarray[int, ndim=1] X_indptr):
cdef unsigned int n_samples = shape[0]
cdef unsigned int n_features = shape[1]
np.ndarray[integral, ndim=1] X_indices,
np.ndarray[integral, ndim=1] X_indptr):
cdef unsigned long long n_samples = shape[0]
cdef unsigned long long n_features = shape[1]

# the column indices for row i are stored in:
# indices[indptr[i]:indptr[i+1]]
# and their corresponding values are stored in:
# data[indptr[i]:indptr[i+1]]
cdef unsigned int i
cdef unsigned int j
cdef np.npy_intp i, j
cdef double sum_

for i in xrange(n_samples):
Expand All @@ -361,13 +363,12 @@ def inplace_csr_row_normalize_l2(X):

def _inplace_csr_row_normalize_l2(np.ndarray[floating, ndim=1] X_data,
shape,
np.ndarray[int, ndim=1] X_indices,
np.ndarray[int, ndim=1] X_indptr):
cdef unsigned int n_samples = shape[0]
cdef unsigned int n_features = shape[1]
np.ndarray[integral, ndim=1] X_indices,
np.ndarray[integral, ndim=1] X_indptr):
cdef integral n_samples = shape[0]
cdef integral n_features = shape[1]

cdef unsigned int i
cdef unsigned int j
cdef np.npy_intp i, j
cdef double sum_

for i in xrange(n_samples):
Expand Down
17 changes: 13 additions & 4 deletions sklearn/utils/tests/test_extmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,10 +206,19 @@ def test_row_norms():
precision)
assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

Xcsr = sparse.csr_matrix(X, dtype=dtype)
assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
precision)
assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), precision)
for csr_index_dtype in [np.int32, np.int64]:
Xcsr = sparse.csr_matrix(X, dtype=dtype)
# csr_matrix will use int32 indices by default,
# up-casting those to int64 when necessary
if csr_index_dtype is np.int64:
Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype)
Xcsr.indices = Xcsr.indices.astype(csr_index_dtype)
assert Xcsr.indices.dtype == csr_index_dtype
assert Xcsr.indptr.dtype == csr_index_dtype
assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
precision)
assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
precision)


def test_randomized_svd_low_rank_with_noise():
Expand Down
18 changes: 13 additions & 5 deletions sklearn/utils/tests/test_sparsefuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,8 +478,16 @@ def test_inplace_normalize():
for dtype in (np.float64, np.float32):
X = rs.randn(10, 5).astype(dtype)
X_csr = sp.csr_matrix(X)
inplace_csr_row_normalize(X_csr)
assert_equal(X_csr.dtype, dtype)
if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
X_csr.data **= 2
assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)
for index_dtype in [np.int32, np.int64]:
# csr_matrix will use int32 indices by default,
# up-casting those to int64 when necessary
if index_dtype is np.int64:
X_csr.indptr = X_csr.indptr.astype(index_dtype)
X_csr.indices = X_csr.indices.astype(index_dtype)
assert X_csr.indices.dtype == index_dtype
assert X_csr.indptr.dtype == index_dtype
inplace_csr_row_normalize(X_csr)
assert_equal(X_csr.dtype, dtype)
if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
X_csr.data **= 2
assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)