From a0dd9d8359e063f421d23b17766f3eedd14096d7 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 19 Oct 2018 17:02:44 +0200 Subject: [PATCH 1/4] fix csr row norms dtype --- sklearn/utils/sparsefuncs_fast.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index b40b843e94322..a43d7fdd85486 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -39,13 +39,12 @@ def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, cdef: unsigned long long n_samples = shape[0] unsigned long long n_features = shape[1] - np.ndarray[DOUBLE, ndim=1, mode="c"] norms + floating[::1] norms = np.zeros(n_samples, dtype=X_data.dtype) - np.npy_intp i, j + unsigned long long i + int j double sum_ - norms = np.zeros(n_samples, dtype=np.float64) - for i in range(n_samples): sum_ = 0.0 for j in range(X_indptr[i], X_indptr[i + 1]): From 298b4caabaccfa30550af3023908d14b5b6a3158 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 19 Oct 2018 17:03:01 +0200 Subject: [PATCH 2/4] add test --- sklearn/utils/tests/test_sparsefuncs.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 03c0c717d3174..67806d7117e51 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -18,7 +18,8 @@ count_nonzero, csc_median_axis_0) from sklearn.utils.sparsefuncs_fast import (assign_rows_csr, inplace_csr_row_normalize_l1, - inplace_csr_row_normalize_l2) + inplace_csr_row_normalize_l2, + csr_row_norms) from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_allclose @@ -512,3 +513,14 @@ def test_inplace_normalize(): if inplace_csr_row_normalize is inplace_csr_row_normalize_l2: X_csr.data **= 2 assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones) + + 
+@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_csr_row_norms(dtype): + # checks that csr_row_norms returns the same output as + # scipy.sparse.linalg.norm, and that the dype is the same X's. + X = sp.random(100, 10, format='csr', dtype=dtype) + scipy_norms = sp.linalg.norm(X, axis=1)**2 + norms = csr_row_norms(X) + assert norms.dtype.type is dtype + assert_array_almost_equal(norms, scipy_norms) From 1e0e1e484ad9e6007920d16f5e93bb8c9cb1eb72 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 19 Oct 2018 17:18:58 +0200 Subject: [PATCH 3/4] typo --- sklearn/utils/tests/test_sparsefuncs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 67806d7117e51..3576ae991d23f 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -518,7 +518,7 @@ def test_inplace_normalize(): @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_csr_row_norms(dtype): # checks that csr_row_norms returns the same output as - # scipy.sparse.linalg.norm, and that the dype is the same X's. + # scipy.sparse.linalg.norm, and that the dtype is the same as X. 
X = sp.random(100, 10, format='csr', dtype=dtype) scipy_norms = sp.linalg.norm(X, axis=1)**2 norms = csr_row_norms(X) From 46b5ea338cca28fc6067f67288c76f0a3f6a4100 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 19 Oct 2018 17:53:20 +0200 Subject: [PATCH 4/4] fix norms def --- sklearn/utils/sparsefuncs_fast.pyx | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index a43d7fdd85486..ece0d0df06218 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -25,24 +25,28 @@ ctypedef fused integral: ctypedef np.float64_t DOUBLE + def csr_row_norms(X): """L2 norm of each row in CSR matrix X.""" if X.dtype not in [np.float32, np.float64]: X = X.astype(np.float64) - return _csr_row_norms(X.data, X.shape, X.indices, X.indptr) + + norms = np.zeros(X.shape[0], dtype=X.data.dtype) + _csr_row_norms(X.data, X.shape, X.indices, X.indptr, norms) + + return norms def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, shape, np.ndarray[integral, ndim=1, mode="c"] X_indices, - np.ndarray[integral, ndim=1, mode="c"] X_indptr): + np.ndarray[integral, ndim=1, mode="c"] X_indptr, + floating[::1] norms): cdef: unsigned long long n_samples = shape[0] - unsigned long long n_features = shape[1] - floating[::1] norms = np.zeros(n_samples, dtype=X_data.dtype) - + unsigned long long i - int j + integral j double sum_ for i in range(n_samples): @@ -51,8 +55,6 @@ def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, sum_ += X_data[j] * X_data[j] norms[i] = sum_ - return norms - def csr_mean_variance_axis0(X): """Compute mean and variance along axis 0 on a CSR matrix