Skip to content

[MRG+2] Use fused types in sparse mean variance functions #6593

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ Enhancements
- Add option to show ``indicator features`` in the output of Imputer.
By `Mani Teja`_.

- Reduce the memory usage for 32-bit float input arrays of :func:`utils.mean_variance_axis` and
:func:`utils.incr_mean_variance_axis` by supporting cython fused types. By `YenChen Lin`_.

Bug fixes
.........

Expand Down
118 changes: 85 additions & 33 deletions sklearn/utils/sparsefuncs_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -64,24 +64,36 @@ def csr_mean_variance_axis0(X):
Feature-wise variances

"""
cdef unsigned int n_samples = X.shape[0]
cdef unsigned int n_features = X.shape[1]
if X.dtype != np.float32:
X = X.astype(np.float64)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can cast X to np.float32 if X.dtype is np.int32 right?

return _csr_mean_variance_axis0(X.data, X.shape, X.indices)

cdef np.ndarray[DOUBLE, ndim=1, mode="c"] X_data
X_data = np.asarray(X.data, dtype=np.float64) # might copy!
cdef np.ndarray[int, ndim=1] X_indices = X.indices

def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
shape,
np.ndarray[int, ndim=1] X_indices):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could write a small comment if we want to figure out why you wrote this wrapper. (That is, you can pass arguments that could have different dtypes only as arguments to functions and in this case floating identifies the dtype of the argument passed. The same cannot be done with variables)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ofc in a much better way.

# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef unsigned int n_samples = shape[0]
cdef unsigned int n_features = shape[1]

cdef unsigned int i
cdef unsigned int non_zero = X_indices.shape[0]
cdef unsigned int col_ind
cdef double diff
cdef floating diff

# means[j] contains the mean of feature j
cdef np.ndarray[DOUBLE, ndim=1] means = np.zeros(n_features,
dtype=np.float64)

cdef np.ndarray[floating, ndim=1] means
# variances[j] contains the variance of feature j
cdef np.ndarray[DOUBLE, ndim=1] variances = np.zeros_like(means)
cdef np.ndarray[floating, ndim=1] variances

if floating is float:
dtype = np.float32
else:
dtype = np.float64

means = np.zeros(n_features, dtype=dtype)
variances = np.zeros_like(means, dtype=dtype)

# counts[j] contains the number of samples where feature j is non-zero
cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features,
Expand Down Expand Up @@ -124,27 +136,38 @@ def csc_mean_variance_axis0(X):
Feature-wise variances

"""
cdef unsigned int n_samples = X.shape[0]
cdef unsigned int n_features = X.shape[1]
if X.dtype != np.float32:
X = X.astype(np.float64)
return _csc_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr)


cdef np.ndarray[DOUBLE, ndim=1] X_data
X_data = np.asarray(X.data, dtype=np.float64) # might copy!
cdef np.ndarray[int, ndim=1] X_indices = X.indices
cdef np.ndarray[int, ndim=1] X_indptr = X.indptr
def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
shape,
np.ndarray[int, ndim=1] X_indices,
np.ndarray[int, ndim=1] X_indptr):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef unsigned int n_samples = shape[0]
cdef unsigned int n_features = shape[1]

cdef unsigned int i
cdef unsigned int j
cdef unsigned int counts
cdef unsigned int startptr
cdef unsigned int endptr
cdef double diff
cdef floating diff

# means[j] contains the mean of feature j
cdef np.ndarray[DOUBLE, ndim=1] means = np.zeros(n_features,
dtype=np.float64)

cdef np.ndarray[floating, ndim=1] means
# variances[j] contains the variance of feature j
cdef np.ndarray[DOUBLE, ndim=1] variances = np.zeros_like(means)
cdef np.ndarray[floating, ndim=1] variances
if floating is float:
dtype = np.float32
else:
dtype = np.float64

means = np.zeros(n_features, dtype=dtype)
variances = np.zeros_like(means, dtype=dtype)

for i in xrange(n_features):

Expand Down Expand Up @@ -210,29 +233,58 @@ def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n):
`utils.extmath._batch_mean_variance_update`.

"""
cdef unsigned long n_samples = X.shape[0]
cdef unsigned int n_features = X.shape[1]
if X.dtype != np.float32:
X = X.astype(np.float64)
return _incr_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr,
X.format, last_mean, last_var, last_n)


def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be a cdef right? There is not point in exposing this as part of the Python-level API.

I think we also need the usual decorators:

 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)

Maybe we could enable them globally for this cython file instead.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah sorry for the my carelessness, you are right! Thanks @ogrisel .
However, in my merged PR #6588 and #6539, I didn't use cdef, which seems to be a wrong move.

Should I fix that in this PR or create another PR?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just create another so that this can be merged soon.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Base on the discussion in #6659,
I think we should just leave it as def.

shape,
np.ndarray[int, ndim=1] X_indices,
np.ndarray[int, ndim=1] X_indptr,
X_format,
last_mean,
last_var,
unsigned long last_n):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef unsigned long n_samples = shape[0]
cdef unsigned int n_features = shape[1]
cdef unsigned int i

# last = stats until now
# new = the current increment
# updated = the aggregated stats
# when arrays, they are indexed by i per-feature
cdef np.ndarray[DOUBLE, ndim=1] new_mean = np.zeros(n_features,
dtype=np.float64)
cdef np.ndarray[DOUBLE, ndim=1] new_var = np.zeros_like(new_mean)
cdef np.ndarray[floating, ndim=1] new_mean
cdef np.ndarray[floating, ndim=1] new_var
cdef np.ndarray[floating, ndim=1] updated_mean
cdef np.ndarray[floating, ndim=1] updated_var
if floating is float:
dtype = np.float32
else:
dtype = np.float64
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style:

dtype = np.float32 if floating is float else np.float64

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally, I find that a bit too much of a tongue-twister!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:)

as you wish.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

haha


new_mean = np.zeros(n_features, dtype=dtype)
new_var = np.zeros_like(new_mean, dtype=dtype)
updated_mean = np.zeros_like(new_mean, dtype=dtype)
updated_var = np.zeros_like(new_mean, dtype=dtype)

cdef unsigned long new_n
cdef np.ndarray[DOUBLE, ndim=1] updated_mean = np.zeros_like(new_mean)
cdef np.ndarray[DOUBLE, ndim=1] updated_var = np.zeros_like(new_mean)
cdef unsigned long updated_n
cdef DOUBLE last_over_new_n
cdef floating last_over_new_n

# Obtain new stats first
new_n = n_samples
if isinstance(X, sp.csr_matrix):
new_mean, new_var = csr_mean_variance_axis0(X)
elif isinstance(X, sp.csc_matrix):
new_mean, new_var = csc_mean_variance_axis0(X)

if X_format == 'csr':
# X is a CSR matrix
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this way of detecting the type of X hackish. Please pass X in addition to the other arguments to this private subfunction and keep using the old if isinstance(X, sp.csr_matrix) / elif isinstance(X, sp.csc_matrix) tests.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can also use X.format == 'csr'

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, +1 for passing X.format as an additional argument to the private subfunction then.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was the one who suggested this, trying to show off my superior scipy sparse matrix tricks. Sorry about that!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Also I learnt most of it from @jnothman , so I'm not the only one to blame :P )

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Explicit is better than implicit ;)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey, @MechCoder, that's not fair! I would have never said len(X.indptr) == X.shape[0] + 1 entails X.format == 'CSR'. That condition is surely also true for any square CSC matrix.

new_mean, new_var = _csr_mean_variance_axis0(X_data, shape, X_indices)
else:
# X is a CSC matrix
new_mean, new_var = _csc_mean_variance_axis0(X_data, shape, X_indices,
X_indptr)

# First pass
if last_n == 0:
Expand Down
109 changes: 47 additions & 62 deletions sklearn/utils/tests/test_sparsefuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,29 +30,26 @@ def test_mean_variance_axis0():
X_lil = sp.lil_matrix(X)
X_lil[1, 0] = 0
X[1, 0] = 0
X_csr = sp.csr_matrix(X_lil)

X_means, X_vars = mean_variance_axis(X_csr, axis=0)
assert_array_almost_equal(X_means, np.mean(X, axis=0))
assert_array_almost_equal(X_vars, np.var(X, axis=0))
assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)

X_csr = sp.csr_matrix(X_lil)
X_csc = sp.csc_matrix(X_lil)
X_means, X_vars = mean_variance_axis(X_csc, axis=0)

assert_array_almost_equal(X_means, np.mean(X, axis=0))
assert_array_almost_equal(X_vars, np.var(X, axis=0))
assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)
expected_dtypes = [(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64)]

X = X.astype(np.float32)
X_csr = X_csr.astype(np.float32)
X_csc = X_csr.astype(np.float32)
X_means, X_vars = mean_variance_axis(X_csr, axis=0)
assert_array_almost_equal(X_means, np.mean(X, axis=0))
assert_array_almost_equal(X_vars, np.var(X, axis=0))
X_means, X_vars = mean_variance_axis(X_csc, axis=0)
assert_array_almost_equal(X_means, np.mean(X, axis=0))
assert_array_almost_equal(X_vars, np.var(X, axis=0))
assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)
for input_dtype, output_dtype in expected_dtypes:
X_test = X.astype(input_dtype)
for X_sparse in (X_csr, X_csc):
X_sparse = X_sparse.astype(input_dtype)
X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
assert_equal(X_means.dtype, output_dtype)
assert_equal(X_vars.dtype, output_dtype)
assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
assert_array_almost_equal(X_vars, np.var(X_test, axis=0))


def test_mean_variance_axis1():
Expand All @@ -64,29 +61,26 @@ def test_mean_variance_axis1():
X_lil = sp.lil_matrix(X)
X_lil[1, 0] = 0
X[1, 0] = 0
X_csr = sp.csr_matrix(X_lil)

X_means, X_vars = mean_variance_axis(X_csr, axis=1)
assert_array_almost_equal(X_means, np.mean(X, axis=1))
assert_array_almost_equal(X_vars, np.var(X, axis=1))
assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)

X_csr = sp.csr_matrix(X_lil)
X_csc = sp.csc_matrix(X_lil)
X_means, X_vars = mean_variance_axis(X_csc, axis=1)

assert_array_almost_equal(X_means, np.mean(X, axis=1))
assert_array_almost_equal(X_vars, np.var(X, axis=1))
assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
expected_dtypes = [(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64)]

X = X.astype(np.float32)
X_csr = X_csr.astype(np.float32)
X_csc = X_csr.astype(np.float32)
X_means, X_vars = mean_variance_axis(X_csr, axis=1)
assert_array_almost_equal(X_means, np.mean(X, axis=1))
assert_array_almost_equal(X_vars, np.var(X, axis=1))
X_means, X_vars = mean_variance_axis(X_csc, axis=1)
assert_array_almost_equal(X_means, np.mean(X, axis=1))
assert_array_almost_equal(X_vars, np.var(X, axis=1))
assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
for input_dtype, output_dtype in expected_dtypes:
X_test = X.astype(input_dtype)
for X_sparse in (X_csr, X_csc):
X_sparse = X_sparse.astype(input_dtype)
X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
assert_equal(X_means.dtype, output_dtype)
assert_equal(X_vars.dtype, output_dtype)
assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
assert_array_almost_equal(X_vars, np.var(X_test, axis=0))


def test_incr_mean_variance_axis():
Expand Down Expand Up @@ -132,34 +126,25 @@ def test_incr_mean_variance_axis():
X = np.vstack(data_chunks)
X_lil = sp.lil_matrix(X)
X_csr = sp.csr_matrix(X_lil)
X_means, X_vars = mean_variance_axis(X_csr, axis)
X_means_incr, X_vars_incr, n_incr = \
incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
assert_equal(X.shape[axis], n_incr)

X_csc = sp.csc_matrix(X_lil)
X_means, X_vars = mean_variance_axis(X_csc, axis)
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
assert_equal(X.shape[axis], n_incr)

# All data but as float
X = X.astype(np.float32)
X_csr = X_csr.astype(np.float32)
X_means, X_vars = mean_variance_axis(X_csr, axis)
X_means_incr, X_vars_incr, n_incr = \
incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
assert_equal(X.shape[axis], n_incr)

X_csc = X_csr.astype(np.float32)
X_means, X_vars = mean_variance_axis(X_csc, axis)
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
assert_equal(X.shape[axis], n_incr)
expected_dtypes = [(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64)]

for input_dtype, output_dtype in expected_dtypes:
for X_sparse in (X_csr, X_csc):
X_sparse = X_sparse.astype(input_dtype)
X_means, X_vars = mean_variance_axis(X_sparse, axis)
X_means_incr, X_vars_incr, n_incr = \
incr_mean_variance_axis(X_sparse, axis, last_mean,
last_var, last_n)
assert_equal(X_means_incr.dtype, output_dtype)
assert_equal(X_vars_incr.dtype, output_dtype)
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
assert_equal(X.shape[axis], n_incr)


def test_mean_variance_illegal_axis():
Expand Down