scikit-learn · MechCoder · Apr 16, 2016 · Mar 26, 2016 · Apr 2, 2016 · Apr 16, 2016
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -131,6 +131,9 @@ Enhancements
    - Add option to show ``indicator features`` in the output of Imputer.
      By `Mani Teja`_.
 
+   - Reduce the memory usage for 32-bit float input arrays of :func:`utils.mean_variance_axis` and 
+     :func:`utils.incr_mean_variance_axis` by supporting cython fused types. By `YenChen Lin`_.
+
 Bug fixes
 .........
 

diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx
@@ -64,24 +64,36 @@ def csr_mean_variance_axis0(X):
         Feature-wise variances
 
     """
-    cdef unsigned int n_samples = X.shape[0]
-    cdef unsigned int n_features = X.shape[1]
+    if X.dtype != np.float32:
+        X = X.astype(np.float64)
+    return _csr_mean_variance_axis0(X.data, X.shape, X.indices)
 
-    cdef np.ndarray[DOUBLE, ndim=1, mode="c"] X_data
-    X_data = np.asarray(X.data, dtype=np.float64)     # might copy!
-    cdef np.ndarray[int, ndim=1] X_indices = X.indices
+
+def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
+                             shape,
+                             np.ndarray[int, ndim=1] X_indices):
+    # Implement the function here since variables using fused types
+    # cannot be declared directly and can only be passed as function arguments
+    cdef unsigned int n_samples = shape[0]
+    cdef unsigned int n_features = shape[1]
 
     cdef unsigned int i
     cdef unsigned int non_zero = X_indices.shape[0]
     cdef unsigned int col_ind
-    cdef double diff
+    cdef floating diff
 
     # means[j] contains the mean of feature j
-    cdef np.ndarray[DOUBLE, ndim=1] means = np.zeros(n_features,
-                                                     dtype=np.float64)
-
+    cdef np.ndarray[floating, ndim=1] means
     # variances[j] contains the variance of feature j
-    cdef np.ndarray[DOUBLE, ndim=1] variances = np.zeros_like(means)
+    cdef np.ndarray[floating, ndim=1] variances
+
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    means = np.zeros(n_features, dtype=dtype)
+    variances = np.zeros_like(means, dtype=dtype)
 
     # counts[j] contains the number of samples where feature j is non-zero
     cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features,
@@ -124,27 +136,38 @@ def csc_mean_variance_axis0(X):
         Feature-wise variances
 
     """
-    cdef unsigned int n_samples = X.shape[0]
-    cdef unsigned int n_features = X.shape[1]
+    if X.dtype != np.float32:
+        X = X.astype(np.float64)
+    return _csc_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr)
+
 
-    cdef np.ndarray[DOUBLE, ndim=1] X_data
-    X_data = np.asarray(X.data, dtype=np.float64)     # might copy!
-    cdef np.ndarray[int, ndim=1] X_indices = X.indices
-    cdef np.ndarray[int, ndim=1] X_indptr = X.indptr
+def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
+                             shape,
+                             np.ndarray[int, ndim=1] X_indices,
+                             np.ndarray[int, ndim=1] X_indptr):
+    # Implement the function here since variables using fused types
+    # cannot be declared directly and can only be passed as function arguments
+    cdef unsigned int n_samples = shape[0]
+    cdef unsigned int n_features = shape[1]
 
     cdef unsigned int i
     cdef unsigned int j
     cdef unsigned int counts
     cdef unsigned int startptr
     cdef unsigned int endptr
-    cdef double diff
+    cdef floating diff
 
     # means[j] contains the mean of feature j
-    cdef np.ndarray[DOUBLE, ndim=1] means = np.zeros(n_features,
-                                                     dtype=np.float64)
-
+    cdef np.ndarray[floating, ndim=1] means
     # variances[j] contains the variance of feature j
-    cdef np.ndarray[DOUBLE, ndim=1] variances = np.zeros_like(means)
+    cdef np.ndarray[floating, ndim=1] variances
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    means = np.zeros(n_features, dtype=dtype)
+    variances = np.zeros_like(means, dtype=dtype)
 
     for i in xrange(n_features):
 
@@ -210,29 +233,58 @@ def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n):
     `utils.extmath._batch_mean_variance_update`.
 
     """
-    cdef unsigned long n_samples = X.shape[0]
-    cdef unsigned int n_features = X.shape[1]
+    if X.dtype != np.float32:
+        X = X.astype(np.float64)
+    return _incr_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr,
+                                     X.format, last_mean, last_var, last_n)
+
+
+def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
+                              shape,
+                              np.ndarray[int, ndim=1] X_indices,
+                              np.ndarray[int, ndim=1] X_indptr,
+                              X_format,
+                              last_mean,
+                              last_var,
+                              unsigned long last_n):
+    # Implement the function here since variables using fused types
+    # cannot be declared directly and can only be passed as function arguments
+    cdef unsigned long n_samples = shape[0]
+    cdef unsigned int n_features = shape[1]
     cdef unsigned int i
 
     # last = stats until now
     # new = the current increment
     # updated = the aggregated stats
     # when arrays, they are indexed by i per-feature
-    cdef np.ndarray[DOUBLE, ndim=1] new_mean = np.zeros(n_features,
-                                                      dtype=np.float64)
-    cdef np.ndarray[DOUBLE, ndim=1] new_var = np.zeros_like(new_mean)
+    cdef np.ndarray[floating, ndim=1] new_mean
+    cdef np.ndarray[floating, ndim=1] new_var
+    cdef np.ndarray[floating, ndim=1] updated_mean
+    cdef np.ndarray[floating, ndim=1] updated_var
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    new_mean = np.zeros(n_features, dtype=dtype)
+    new_var = np.zeros_like(new_mean, dtype=dtype)
+    updated_mean = np.zeros_like(new_mean, dtype=dtype)
+    updated_var = np.zeros_like(new_mean, dtype=dtype)
+
     cdef unsigned long new_n
-    cdef np.ndarray[DOUBLE, ndim=1] updated_mean = np.zeros_like(new_mean)
-    cdef np.ndarray[DOUBLE, ndim=1] updated_var = np.zeros_like(new_mean)
     cdef unsigned long updated_n
-    cdef DOUBLE last_over_new_n
+    cdef floating last_over_new_n
 
     # Obtain new stats first
     new_n = n_samples
-    if isinstance(X, sp.csr_matrix):
-        new_mean, new_var = csr_mean_variance_axis0(X)
-    elif isinstance(X, sp.csc_matrix):
-        new_mean, new_var = csc_mean_variance_axis0(X)
+
+    if X_format == 'csr':
+        # X is a CSR matrix
+        new_mean, new_var = _csr_mean_variance_axis0(X_data, shape, X_indices)
+    else:
+        # X is a CSC matrix
+        new_mean, new_var = _csc_mean_variance_axis0(X_data, shape, X_indices,
+                                                     X_indptr)
 
     # First pass
     if last_n == 0:

diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
@@ -30,29 +30,26 @@ def test_mean_variance_axis0():
     X_lil = sp.lil_matrix(X)
     X_lil[1, 0] = 0
     X[1, 0] = 0
-    X_csr = sp.csr_matrix(X_lil)
 
-    X_means, X_vars = mean_variance_axis(X_csr, axis=0)
-    assert_array_almost_equal(X_means, np.mean(X, axis=0))
-    assert_array_almost_equal(X_vars, np.var(X, axis=0))
+    assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)
 
+    X_csr = sp.csr_matrix(X_lil)
     X_csc = sp.csc_matrix(X_lil)
-    X_means, X_vars = mean_variance_axis(X_csc, axis=0)
 
-    assert_array_almost_equal(X_means, np.mean(X, axis=0))
-    assert_array_almost_equal(X_vars, np.var(X, axis=0))
-    assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)
+    expected_dtypes = [(np.float32, np.float32),
+                       (np.float64, np.float64),
+                       (np.int32, np.float64),
+                       (np.int64, np.float64)]
 
-    X = X.astype(np.float32)
-    X_csr = X_csr.astype(np.float32)
-    X_csc = X_csr.astype(np.float32)
-    X_means, X_vars = mean_variance_axis(X_csr, axis=0)
-    assert_array_almost_equal(X_means, np.mean(X, axis=0))
-    assert_array_almost_equal(X_vars, np.var(X, axis=0))
-    X_means, X_vars = mean_variance_axis(X_csc, axis=0)
-    assert_array_almost_equal(X_means, np.mean(X, axis=0))
-    assert_array_almost_equal(X_vars, np.var(X, axis=0))
-    assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)
+    for input_dtype, output_dtype in expected_dtypes:
+        X_test = X.astype(input_dtype)
+        for X_sparse in (X_csr, X_csc):
+            X_sparse = X_sparse.astype(input_dtype)
+            X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
+            assert_equal(X_means.dtype, output_dtype)
+            assert_equal(X_vars.dtype, output_dtype)
+            assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
+            assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
 
 
 def test_mean_variance_axis1():
@@ -64,29 +61,26 @@ def test_mean_variance_axis1():
     X_lil = sp.lil_matrix(X)
     X_lil[1, 0] = 0
     X[1, 0] = 0
-    X_csr = sp.csr_matrix(X_lil)
 
-    X_means, X_vars = mean_variance_axis(X_csr, axis=1)
-    assert_array_almost_equal(X_means, np.mean(X, axis=1))
-    assert_array_almost_equal(X_vars, np.var(X, axis=1))
+    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
 
+    X_csr = sp.csr_matrix(X_lil)
     X_csc = sp.csc_matrix(X_lil)
-    X_means, X_vars = mean_variance_axis(X_csc, axis=1)
 
-    assert_array_almost_equal(X_means, np.mean(X, axis=1))
-    assert_array_almost_equal(X_vars, np.var(X, axis=1))
-    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
+    expected_dtypes = [(np.float32, np.float32),
+                       (np.float64, np.float64),
+                       (np.int32, np.float64),
+                       (np.int64, np.float64)]
 
-    X = X.astype(np.float32)
-    X_csr = X_csr.astype(np.float32)
-    X_csc = X_csr.astype(np.float32)
-    X_means, X_vars = mean_variance_axis(X_csr, axis=1)
-    assert_array_almost_equal(X_means, np.mean(X, axis=1))
-    assert_array_almost_equal(X_vars, np.var(X, axis=1))
-    X_means, X_vars = mean_variance_axis(X_csc, axis=1)
-    assert_array_almost_equal(X_means, np.mean(X, axis=1))
-    assert_array_almost_equal(X_vars, np.var(X, axis=1))
-    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
+    for input_dtype, output_dtype in expected_dtypes:
+        X_test = X.astype(input_dtype)
+        for X_sparse in (X_csr, X_csc):
+            X_sparse = X_sparse.astype(input_dtype)
+            X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
+            assert_equal(X_means.dtype, output_dtype)
+            assert_equal(X_vars.dtype, output_dtype)
+            assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
+            assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
 
 
 def test_incr_mean_variance_axis():
@@ -132,34 +126,25 @@ def test_incr_mean_variance_axis():
         X = np.vstack(data_chunks)
         X_lil = sp.lil_matrix(X)
         X_csr = sp.csr_matrix(X_lil)
-        X_means, X_vars = mean_variance_axis(X_csr, axis)
-        X_means_incr, X_vars_incr, n_incr = \
-            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
-        assert_array_almost_equal(X_means, X_means_incr)
-        assert_array_almost_equal(X_vars, X_vars_incr)
-        assert_equal(X.shape[axis], n_incr)
-
         X_csc = sp.csc_matrix(X_lil)
-        X_means, X_vars = mean_variance_axis(X_csc, axis)
-        assert_array_almost_equal(X_means, X_means_incr)
-        assert_array_almost_equal(X_vars, X_vars_incr)
-        assert_equal(X.shape[axis], n_incr)
 
-        # All data but as float
-        X = X.astype(np.float32)
-        X_csr = X_csr.astype(np.float32)
-        X_means, X_vars = mean_variance_axis(X_csr, axis)
-        X_means_incr, X_vars_incr, n_incr = \
-            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
-        assert_array_almost_equal(X_means, X_means_incr)
-        assert_array_almost_equal(X_vars, X_vars_incr)
-        assert_equal(X.shape[axis], n_incr)
-
-        X_csc = X_csr.astype(np.float32)
-        X_means, X_vars = mean_variance_axis(X_csc, axis)
-        assert_array_almost_equal(X_means, X_means_incr)
-        assert_array_almost_equal(X_vars, X_vars_incr)
-        assert_equal(X.shape[axis], n_incr)
+        expected_dtypes = [(np.float32, np.float32),
+                           (np.float64, np.float64),
+                           (np.int32, np.float64),
+                           (np.int64, np.float64)]
+
+        for input_dtype, output_dtype in expected_dtypes:
+            for X_sparse in (X_csr, X_csc):
+                X_sparse = X_sparse.astype(input_dtype)
+                X_means, X_vars = mean_variance_axis(X_sparse, axis)
+                X_means_incr, X_vars_incr, n_incr = \
+                    incr_mean_variance_axis(X_sparse, axis, last_mean,
+                                            last_var, last_n)
+                assert_equal(X_means_incr.dtype, output_dtype)
+                assert_equal(X_vars_incr.dtype, output_dtype)
+                assert_array_almost_equal(X_means, X_means_incr)
+                assert_array_almost_equal(X_vars, X_vars_incr)
+                assert_equal(X.shape[axis], n_incr)
 
 
 def test_mean_variance_illegal_axis():