-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
MAINT cython typedefs in sparsefuncs_fast #27333
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5c67956
145b8fd
c259d79
9f3f090
e9172d9
9fa055e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,18 +7,23 @@ | |
# License: BSD 3 clause | ||
|
||
from libc.math cimport fabs, sqrt, isnan | ||
from libc.stdint cimport intptr_t | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need this or better use our There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think we better use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think. I'm confused and would prefer C/C++ over Cython, but that's another story. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or can someone tell what the difference between There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jjerphan If this point is clear then maybe we can merge this PR? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, here this choice is safe, but I think we could clarify types for indices in another PR. |
||
|
||
cimport numpy as cnp | ||
import numpy as np | ||
from cython cimport floating | ||
from ..utils._typedefs cimport float64_t, int32_t, intp_t, uint64_t | ||
|
||
cnp.import_array() | ||
|
||
ctypedef fused integral: | ||
int | ||
long long | ||
|
||
ctypedef cnp.float64_t DOUBLE | ||
# This `integral` fused type is defined to be used over `cython.integral` | ||
# to only generate implementations for `int{32,64}_t`. | ||
# TODO: use `{int,float}{32,64}_t` when cython#5230 is resolved: | ||
lorentzenchr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# https://github.com/cython/cython/issues/5230 | ||
ctypedef fused integral: | ||
int # int32_t | ||
long long # int64_t | ||
jjerphan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def csr_row_norms(X): | ||
|
@@ -95,40 +100,40 @@ def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False): | |
|
||
def _csr_mean_variance_axis0( | ||
const floating[::1] X_data, | ||
unsigned long long n_samples, | ||
unsigned long long n_features, | ||
uint64_t n_samples, | ||
uint64_t n_features, | ||
const integral[:] X_indices, | ||
const integral[:] X_indptr, | ||
const floating[:] weights, | ||
): | ||
# Implement the function here since variables using fused types | ||
# cannot be declared directly and can only be passed as function arguments | ||
cdef: | ||
cnp.intp_t row_ind | ||
unsigned long long feature_idx | ||
intp_t row_ind | ||
uint64_t feature_idx | ||
integral i, col_ind | ||
cnp.float64_t diff | ||
float64_t diff | ||
# means[j] contains the mean of feature j | ||
cnp.float64_t[::1] means = np.zeros(n_features) | ||
float64_t[::1] means = np.zeros(n_features) | ||
# variances[j] contains the variance of feature j | ||
cnp.float64_t[::1] variances = np.zeros(n_features) | ||
float64_t[::1] variances = np.zeros(n_features) | ||
|
||
cnp.float64_t[::1] sum_weights = np.full( | ||
float64_t[::1] sum_weights = np.full( | ||
fill_value=np.sum(weights, dtype=np.float64), shape=n_features | ||
) | ||
cnp.float64_t[::1] sum_weights_nz = np.zeros(shape=n_features) | ||
cnp.float64_t[::1] correction = np.zeros(shape=n_features) | ||
float64_t[::1] sum_weights_nz = np.zeros(shape=n_features) | ||
float64_t[::1] correction = np.zeros(shape=n_features) | ||
|
||
cnp.uint64_t[::1] counts = np.full( | ||
uint64_t[::1] counts = np.full( | ||
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64 | ||
) | ||
cnp.uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64) | ||
uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64) | ||
|
||
for row_ind in range(len(X_indptr) - 1): | ||
for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]): | ||
col_ind = X_indices[i] | ||
if not isnan(X_data[i]): | ||
means[col_ind] += <cnp.float64_t>(X_data[i]) * weights[row_ind] | ||
means[col_ind] += <float64_t>(X_data[i]) * weights[row_ind] | ||
# sum of weights where X[:, col_ind] is non-zero | ||
sum_weights_nz[col_ind] += weights[row_ind] | ||
# number of non-zero elements of X[:, col_ind] | ||
|
@@ -229,8 +234,8 @@ def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False): | |
|
||
def _csc_mean_variance_axis0( | ||
const floating[::1] X_data, | ||
unsigned long long n_samples, | ||
unsigned long long n_features, | ||
uint64_t n_samples, | ||
uint64_t n_features, | ||
const integral[:] X_indices, | ||
const integral[:] X_indptr, | ||
const floating[:] weights, | ||
|
@@ -239,29 +244,29 @@ def _csc_mean_variance_axis0( | |
# cannot be declared directly and can only be passed as function arguments | ||
cdef: | ||
integral i, row_ind | ||
unsigned long long feature_idx, col_ind | ||
cnp.float64_t diff | ||
uint64_t feature_idx, col_ind | ||
float64_t diff | ||
# means[j] contains the mean of feature j | ||
cnp.float64_t[::1] means = np.zeros(n_features) | ||
float64_t[::1] means = np.zeros(n_features) | ||
# variances[j] contains the variance of feature j | ||
cnp.float64_t[::1] variances = np.zeros(n_features) | ||
float64_t[::1] variances = np.zeros(n_features) | ||
|
||
cnp.float64_t[::1] sum_weights = np.full( | ||
float64_t[::1] sum_weights = np.full( | ||
fill_value=np.sum(weights, dtype=np.float64), shape=n_features | ||
) | ||
cnp.float64_t[::1] sum_weights_nz = np.zeros(shape=n_features) | ||
cnp.float64_t[::1] correction = np.zeros(shape=n_features) | ||
float64_t[::1] sum_weights_nz = np.zeros(shape=n_features) | ||
float64_t[::1] correction = np.zeros(shape=n_features) | ||
|
||
cnp.uint64_t[::1] counts = np.full( | ||
uint64_t[::1] counts = np.full( | ||
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64 | ||
) | ||
cnp.uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64) | ||
uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64) | ||
|
||
for col_ind in range(n_features): | ||
for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]): | ||
row_ind = X_indices[i] | ||
if not isnan(X_data[i]): | ||
means[col_ind] += <cnp.float64_t>(X_data[i]) * weights[row_ind] | ||
means[col_ind] += <float64_t>(X_data[i]) * weights[row_ind] | ||
# sum of weights where X[:, col_ind] is non-zero | ||
sum_weights_nz[col_ind] += weights[row_ind] | ||
# number of non-zero elements of X[:, col_ind] | ||
|
@@ -387,9 +392,9 @@ def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None): | |
def _incr_mean_variance_axis0( | ||
const floating[:] X_data, | ||
floating n_samples, | ||
unsigned long long n_features, | ||
uint64_t n_features, | ||
const int[:] X_indices, | ||
# X_indptr might be either in32 or int64 | ||
# X_indptr might be either int32 or int64 | ||
const integral[:] X_indptr, | ||
str X_format, | ||
floating[:] last_mean, | ||
|
@@ -401,7 +406,7 @@ def _incr_mean_variance_axis0( | |
# Implement the function here since variables using fused types | ||
# cannot be declared directly and can only be passed as function arguments | ||
cdef: | ||
unsigned long long i | ||
uint64_t i | ||
|
||
# last = stats until now | ||
# new = the current increment | ||
|
@@ -493,13 +498,13 @@ def _inplace_csr_row_normalize_l1( | |
const integral[:] X_indptr, | ||
): | ||
cdef: | ||
unsigned long long n_samples = shape[0] | ||
uint64_t n_samples = shape[0] | ||
|
||
# the column indices for row i are stored in: | ||
# indices[indptr[i]:indices[i+1]] | ||
# and their corresponding values are stored in: | ||
# data[indptr[i]:indptr[i+1]] | ||
unsigned long long i | ||
uint64_t i | ||
integral j | ||
double sum_ | ||
|
||
|
@@ -530,8 +535,8 @@ def _inplace_csr_row_normalize_l2( | |
const integral[:] X_indptr, | ||
): | ||
cdef: | ||
unsigned long long n_samples = shape[0] | ||
unsigned long long i | ||
uint64_t n_samples = shape[0] | ||
uint64_t i | ||
integral j | ||
double sum_ | ||
|
||
|
@@ -554,8 +559,8 @@ def _inplace_csr_row_normalize_l2( | |
|
||
def assign_rows_csr( | ||
X, | ||
const cnp.npy_intp[:] X_rows, | ||
const cnp.npy_intp[:] out_rows, | ||
const intptr_t[:] X_rows, | ||
const intptr_t[:] out_rows, | ||
floating[:, ::1] out, | ||
): | ||
"""Densify selected rows of a CSR matrix into a preallocated array. | ||
|
@@ -571,12 +576,13 @@ def assign_rows_csr( | |
out : array, shape=(arbitrary, n_features) | ||
""" | ||
cdef: | ||
# npy_intp (np.intp in Python) is what np.where returns, | ||
# intptr_t (npy_intp, np.intp in Python) is what np.where returns, | ||
# but int is what scipy.sparse uses. | ||
int i, ind, j, k | ||
cnp.npy_intp rX | ||
intp_t i, ind, j, k | ||
intptr_t rX | ||
const floating[:] data = X.data | ||
const int[:] indices = X.indices, indptr = X.indptr | ||
const int32_t[:] indices = X.indices | ||
const int32_t[:] indptr = X.indptr | ||
|
||
if X_rows.shape[0] != out_rows.shape[0]: | ||
raise ValueError("cannot assign %d rows to %d" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This has also been added in #27334