-
-
Notifications
You must be signed in to change notification settings - Fork 26k
[MRG+2] Use fused types in sparse mean variance functions #6593
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,24 +64,36 @@ def csr_mean_variance_axis0(X): | |
Feature-wise variances | ||
|
||
""" | ||
cdef unsigned int n_samples = X.shape[0] | ||
cdef unsigned int n_features = X.shape[1] | ||
if X.dtype != np.float32: | ||
X = X.astype(np.float64) | ||
return _csr_mean_variance_axis0(X.data, X.shape, X.indices) | ||
|
||
cdef np.ndarray[DOUBLE, ndim=1, mode="c"] X_data | ||
X_data = np.asarray(X.data, dtype=np.float64) # might copy! | ||
cdef np.ndarray[int, ndim=1] X_indices = X.indices | ||
|
||
def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, | ||
shape, | ||
np.ndarray[int, ndim=1] X_indices): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You could add a short comment here so that readers can later figure out why you wrote this wrapper. (That is, values whose dtype may vary can only be typed with a fused type when they are function arguments; in this case `floating` identifies the dtype of the argument passed. The same cannot be done with variables.) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Of course, worded in a much better way. |
||
# Implement the function here since variables using fused types | ||
# cannot be declared directly and can only be passed as function arguments | ||
cdef unsigned int n_samples = shape[0] | ||
cdef unsigned int n_features = shape[1] | ||
|
||
cdef unsigned int i | ||
cdef unsigned int non_zero = X_indices.shape[0] | ||
cdef unsigned int col_ind | ||
cdef double diff | ||
cdef floating diff | ||
|
||
# means[j] contains the mean of feature j | ||
cdef np.ndarray[DOUBLE, ndim=1] means = np.zeros(n_features, | ||
dtype=np.float64) | ||
|
||
cdef np.ndarray[floating, ndim=1] means | ||
# variances[j] contains the variance of feature j | ||
cdef np.ndarray[DOUBLE, ndim=1] variances = np.zeros_like(means) | ||
cdef np.ndarray[floating, ndim=1] variances | ||
|
||
if floating is float: | ||
dtype = np.float32 | ||
else: | ||
dtype = np.float64 | ||
|
||
means = np.zeros(n_features, dtype=dtype) | ||
variances = np.zeros_like(means, dtype=dtype) | ||
|
||
# counts[j] contains the number of samples where feature j is non-zero | ||
cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features, | ||
|
@@ -124,27 +136,38 @@ def csc_mean_variance_axis0(X): | |
Feature-wise variances | ||
|
||
""" | ||
cdef unsigned int n_samples = X.shape[0] | ||
cdef unsigned int n_features = X.shape[1] | ||
if X.dtype != np.float32: | ||
X = X.astype(np.float64) | ||
return _csc_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr) | ||
|
||
|
||
cdef np.ndarray[DOUBLE, ndim=1] X_data | ||
X_data = np.asarray(X.data, dtype=np.float64) # might copy! | ||
cdef np.ndarray[int, ndim=1] X_indices = X.indices | ||
cdef np.ndarray[int, ndim=1] X_indptr = X.indptr | ||
def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, | ||
shape, | ||
np.ndarray[int, ndim=1] X_indices, | ||
np.ndarray[int, ndim=1] X_indptr): | ||
# Implement the function here since variables using fused types | ||
# cannot be declared directly and can only be passed as function arguments | ||
cdef unsigned int n_samples = shape[0] | ||
cdef unsigned int n_features = shape[1] | ||
|
||
cdef unsigned int i | ||
cdef unsigned int j | ||
cdef unsigned int counts | ||
cdef unsigned int startptr | ||
cdef unsigned int endptr | ||
cdef double diff | ||
cdef floating diff | ||
|
||
# means[j] contains the mean of feature j | ||
cdef np.ndarray[DOUBLE, ndim=1] means = np.zeros(n_features, | ||
dtype=np.float64) | ||
|
||
cdef np.ndarray[floating, ndim=1] means | ||
# variances[j] contains the variance of feature j | ||
cdef np.ndarray[DOUBLE, ndim=1] variances = np.zeros_like(means) | ||
cdef np.ndarray[floating, ndim=1] variances | ||
if floating is float: | ||
dtype = np.float32 | ||
else: | ||
dtype = np.float64 | ||
|
||
means = np.zeros(n_features, dtype=dtype) | ||
variances = np.zeros_like(means, dtype=dtype) | ||
|
||
for i in xrange(n_features): | ||
|
||
|
@@ -210,29 +233,58 @@ def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n): | |
`utils.extmath._batch_mean_variance_update`. | ||
|
||
""" | ||
cdef unsigned long n_samples = X.shape[0] | ||
cdef unsigned int n_features = X.shape[1] | ||
if X.dtype != np.float32: | ||
X = X.astype(np.float64) | ||
return _incr_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr, | ||
X.format, last_mean, last_var, last_n) | ||
|
||
|
||
def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be a […]. I think we also need the usual decorators: @cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True) Maybe we could enable them globally for this Cython file instead. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just create another so that this can be merged soon. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Based on the discussion in #6659, |
||
shape, | ||
np.ndarray[int, ndim=1] X_indices, | ||
np.ndarray[int, ndim=1] X_indptr, | ||
X_format, | ||
last_mean, | ||
last_var, | ||
unsigned long last_n): | ||
# Implement the function here since variables using fused types | ||
# cannot be declared directly and can only be passed as function arguments | ||
cdef unsigned long n_samples = shape[0] | ||
cdef unsigned int n_features = shape[1] | ||
cdef unsigned int i | ||
|
||
# last = stats until now | ||
# new = the current increment | ||
# updated = the aggregated stats | ||
# when arrays, they are indexed by i per-feature | ||
cdef np.ndarray[DOUBLE, ndim=1] new_mean = np.zeros(n_features, | ||
dtype=np.float64) | ||
cdef np.ndarray[DOUBLE, ndim=1] new_var = np.zeros_like(new_mean) | ||
cdef np.ndarray[floating, ndim=1] new_mean | ||
cdef np.ndarray[floating, ndim=1] new_var | ||
cdef np.ndarray[floating, ndim=1] updated_mean | ||
cdef np.ndarray[floating, ndim=1] updated_var | ||
if floating is float: | ||
dtype = np.float32 | ||
else: | ||
dtype = np.float64 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Style: `dtype = np.float32 if floating is float else np.float64` There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Personally, I find that a bit too much of a tongue-twister! There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. :) As you wish. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Haha |
||
|
||
new_mean = np.zeros(n_features, dtype=dtype) | ||
new_var = np.zeros_like(new_mean, dtype=dtype) | ||
updated_mean = np.zeros_like(new_mean, dtype=dtype) | ||
updated_var = np.zeros_like(new_mean, dtype=dtype) | ||
|
||
cdef unsigned long new_n | ||
cdef np.ndarray[DOUBLE, ndim=1] updated_mean = np.zeros_like(new_mean) | ||
cdef np.ndarray[DOUBLE, ndim=1] updated_var = np.zeros_like(new_mean) | ||
cdef unsigned long updated_n | ||
cdef DOUBLE last_over_new_n | ||
cdef floating last_over_new_n | ||
|
||
# Obtain new stats first | ||
new_n = n_samples | ||
if isinstance(X, sp.csr_matrix): | ||
new_mean, new_var = csr_mean_variance_axis0(X) | ||
elif isinstance(X, sp.csc_matrix): | ||
new_mean, new_var = csc_mean_variance_axis0(X) | ||
|
||
if X_format == 'csr': | ||
# X is a CSR matrix | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I find this way of detecting the type of […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can also use […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, +1 for passing […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was the one who suggested this, trying to show off my superior scipy sparse matrix tricks. Sorry about that! There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (Also, I learnt most of it from @jnothman, so I'm not the only one to blame :P ) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Explicit is better than implicit ;) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hey, @MechCoder, that's not fair! I would never have said […] |
||
new_mean, new_var = _csr_mean_variance_axis0(X_data, shape, X_indices) | ||
else: | ||
# X is a CSC matrix | ||
new_mean, new_var = _csc_mean_variance_axis0(X_data, shape, X_indices, | ||
X_indptr) | ||
|
||
# First pass | ||
if last_n == 0: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can cast X to `np.float32` if `X.dtype` is `np.int32`, right?