MAINT cython typedefs in sparsefuncs_fast #27333

Merged
merged 6 commits on Sep 14, 2023
Changes from all commits
1 change: 1 addition & 0 deletions sklearn/utils/_typedefs.pxd
@@ -16,6 +16,7 @@
# NOTE: Extend this list as needed when converting more cython extensions.
ctypedef unsigned char uint8_t
ctypedef unsigned int uint32_t
Contributor
This has also been added in #27334

ctypedef unsigned long long uint64_t
ctypedef Py_ssize_t intp_t
ctypedef float float32_t
ctypedef double float64_t
1 change: 1 addition & 0 deletions sklearn/utils/_typedefs.pyx
@@ -9,6 +9,7 @@ import numpy as np
ctypedef fused testing_type_t:
uint8_t
uint32_t
uint64_t
intp_t
float32_t
float64_t
90 changes: 48 additions & 42 deletions sklearn/utils/sparsefuncs_fast.pyx
@@ -7,18 +7,23 @@
# License: BSD 3 clause

from libc.math cimport fabs, sqrt, isnan
from libc.stdint cimport intptr_t
Member Author
Do we need this, or is it better to use our intp_t instead?

Member
@jjerphan Sep 11, 2023
intp_t aliases npy_intp, which aliases Py_intptr_t, which aliases intptr_t.

I think we'd better use intp_t for consistency. WDYT?

Member Author
I don’t think so. I’m confused and would prefer C/C++ over Cython, but that’s another story.

Member Author
Or can someone tell me what the difference is between intp_t (alias of Py_ssize_t) and intptr_t (alias of ssize_t)?

Contributor

@jjerphan If this point is clear, maybe we can merge this PR?

Member
Yes, this choice is safe here, but I think we could clarify the types used for indices in another PR.
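
For reference, a minimal sketch (illustrative only, not part of this PR) of the width equivalence this discussion relies on: on the platforms supported here, np.intp, ssize_t (Py_ssize_t), and a pointer-sized integer (intptr_t) all share the same width, which is why either spelling works for these index variables.

```python
# Illustrative check (assumption: a mainstream 32- or 64-bit platform):
# np.intp, ssize_t and the pointer width (i.e. intptr_t) all coincide,
# which is why intp_t is a safe replacement for intptr_t here.
import ctypes
import numpy as np

intp_width = np.dtype(np.intp).itemsize         # np.intp / npy_intp
ssize_width = ctypes.sizeof(ctypes.c_ssize_t)   # ssize_t / Py_ssize_t
pointer_width = ctypes.sizeof(ctypes.c_void_p)  # width of intptr_t

print(intp_width, ssize_width, pointer_width)   # e.g. 8 8 8 on a 64-bit build
assert intp_width == ssize_width == pointer_width
```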


cimport numpy as cnp
import numpy as np
from cython cimport floating
from ..utils._typedefs cimport float64_t, int32_t, intp_t, uint64_t

cnp.import_array()

ctypedef fused integral:
int
long long

ctypedef cnp.float64_t DOUBLE
# This `integral` fused type is defined to be used over `cython.integral`
# to only generate implementations for `int{32,64}_t`.
# TODO: use `{int,float}{32,64}_t` when cython#5230 is resolved:
# https://github.com/cython/cython/issues/5230
ctypedef fused integral:
int # int32_t
long long # int64_t


def csr_row_norms(X):
@@ -95,40 +100,40 @@ def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False):

def _csr_mean_variance_axis0(
const floating[::1] X_data,
unsigned long long n_samples,
unsigned long long n_features,
uint64_t n_samples,
uint64_t n_features,
const integral[:] X_indices,
const integral[:] X_indptr,
const floating[:] weights,
):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
cnp.intp_t row_ind
unsigned long long feature_idx
intp_t row_ind
uint64_t feature_idx
integral i, col_ind
cnp.float64_t diff
float64_t diff
# means[j] contains the mean of feature j
cnp.float64_t[::1] means = np.zeros(n_features)
float64_t[::1] means = np.zeros(n_features)
# variances[j] contains the variance of feature j
cnp.float64_t[::1] variances = np.zeros(n_features)
float64_t[::1] variances = np.zeros(n_features)

cnp.float64_t[::1] sum_weights = np.full(
float64_t[::1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features
)
cnp.float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
cnp.float64_t[::1] correction = np.zeros(shape=n_features)
float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
float64_t[::1] correction = np.zeros(shape=n_features)

cnp.uint64_t[::1] counts = np.full(
uint64_t[::1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
)
cnp.uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)
uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

for row_ind in range(len(X_indptr) - 1):
for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
col_ind = X_indices[i]
if not isnan(X_data[i]):
means[col_ind] += <cnp.float64_t>(X_data[i]) * weights[row_ind]
means[col_ind] += <float64_t>(X_data[i]) * weights[row_ind]
# sum of weights where X[:, col_ind] is non-zero
sum_weights_nz[col_ind] += weights[row_ind]
# number of non-zero elements of X[:, col_ind]
@@ -229,8 +234,8 @@ def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False):

def _csc_mean_variance_axis0(
const floating[::1] X_data,
unsigned long long n_samples,
unsigned long long n_features,
uint64_t n_samples,
uint64_t n_features,
const integral[:] X_indices,
const integral[:] X_indptr,
const floating[:] weights,
@@ -239,29 +244,29 @@ def _csc_mean_variance_axis0(
# cannot be declared directly and can only be passed as function arguments
cdef:
integral i, row_ind
unsigned long long feature_idx, col_ind
cnp.float64_t diff
uint64_t feature_idx, col_ind
float64_t diff
# means[j] contains the mean of feature j
cnp.float64_t[::1] means = np.zeros(n_features)
float64_t[::1] means = np.zeros(n_features)
# variances[j] contains the variance of feature j
cnp.float64_t[::1] variances = np.zeros(n_features)
float64_t[::1] variances = np.zeros(n_features)

cnp.float64_t[::1] sum_weights = np.full(
float64_t[::1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features
)
cnp.float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
cnp.float64_t[::1] correction = np.zeros(shape=n_features)
float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
float64_t[::1] correction = np.zeros(shape=n_features)

cnp.uint64_t[::1] counts = np.full(
uint64_t[::1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
)
cnp.uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)
uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

for col_ind in range(n_features):
for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
row_ind = X_indices[i]
if not isnan(X_data[i]):
means[col_ind] += <cnp.float64_t>(X_data[i]) * weights[row_ind]
means[col_ind] += <float64_t>(X_data[i]) * weights[row_ind]
# sum of weights where X[:, col_ind] is non-zero
sum_weights_nz[col_ind] += weights[row_ind]
# number of non-zero elements of X[:, col_ind]
@@ -387,9 +392,9 @@ def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
def _incr_mean_variance_axis0(
const floating[:] X_data,
floating n_samples,
unsigned long long n_features,
uint64_t n_features,
const int[:] X_indices,
# X_indptr might be either in32 or int64
# X_indptr might be either int32 or int64
const integral[:] X_indptr,
str X_format,
floating[:] last_mean,
@@ -401,7 +406,7 @@ def _incr_mean_variance_axis0(
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
unsigned long long i
uint64_t i

# last = stats until now
# new = the current increment
@@ -493,13 +498,13 @@ def _inplace_csr_row_normalize_l1(
const integral[:] X_indptr,
):
cdef:
unsigned long long n_samples = shape[0]
uint64_t n_samples = shape[0]

# the column indices for row i are stored in:
# indices[indptr[i]:indices[i+1]]
# and their corresponding values are stored in:
# data[indptr[i]:indptr[i+1]]
unsigned long long i
uint64_t i
integral j
double sum_

@@ -530,8 +535,8 @@ def _inplace_csr_row_normalize_l2(
const integral[:] X_indptr,
):
cdef:
unsigned long long n_samples = shape[0]
unsigned long long i
uint64_t n_samples = shape[0]
uint64_t i
integral j
double sum_

@@ -554,8 +559,8 @@ def _inplace_csr_row_normalize_l2(

def assign_rows_csr(
X,
const cnp.npy_intp[:] X_rows,
const cnp.npy_intp[:] out_rows,
const intptr_t[:] X_rows,
const intptr_t[:] out_rows,
floating[:, ::1] out,
):
"""Densify selected rows of a CSR matrix into a preallocated array.
Expand All @@ -571,12 +576,13 @@ def assign_rows_csr(
out : array, shape=(arbitrary, n_features)
"""
cdef:
# npy_intp (np.intp in Python) is what np.where returns,
# intptr_t (npy_intp, np.intp in Python) is what np.where returns,
# but int is what scipy.sparse uses.
int i, ind, j, k
cnp.npy_intp rX
intp_t i, ind, j, k
intptr_t rX
const floating[:] data = X.data
const int[:] indices = X.indices, indptr = X.indptr
const int32_t[:] indices = X.indices
const int32_t[:] indptr = X.indptr

if X_rows.shape[0] != out_rows.shape[0]:
raise ValueError("cannot assign %d rows to %d"
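
As a side note, a small sketch (illustrative only, not part of the diff) of the dtype mismatch that the comment in assign_rows_csr describes: np.where returns platform-sized intp indices, while scipy.sparse stores indices and indptr as int32 for small matrices.

```python
# Illustrative sketch (assumption: SciPy's default index dtype for small
# matrices): np.where yields intp indices, scipy.sparse CSR uses int32.
import numpy as np
import scipy.sparse as sp

X = sp.random(10, 5, density=0.3, format="csr", random_state=0)
rows = np.where(np.arange(10) % 2 == 0)[0]

print(rows.dtype)       # intp (int64 on 64-bit platforms)
print(X.indices.dtype)  # int32
print(X.indptr.dtype)   # int32
```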