Skip to content

MAINT Fix ctypedef types in tree submodule #27352

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 20 additions & 22 deletions sklearn/ensemble/_gradient_boosting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,9 @@ cnp.import_array()

from scipy.sparse import issparse

from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t
from ..tree._tree cimport Node
from ..tree._tree cimport Tree
from ..tree._tree cimport DTYPE_t
from ..tree._tree cimport SIZE_t
from ..tree._tree cimport INT32_t
from ..tree._utils cimport safe_realloc


Expand All @@ -24,10 +22,10 @@ from numpy import zeros as np_zeros


# constant to mark tree leafs
cdef SIZE_t TREE_LEAF = -1
cdef intp_t TREE_LEAF = -1

cdef void _predict_regression_tree_inplace_fast_dense(
const DTYPE_t[:, ::1] X,
const float32_t[:, ::1] X,
Node* root_node,
double *value,
double scale,
Expand All @@ -45,7 +43,7 @@ cdef void _predict_regression_tree_inplace_fast_dense(

Parameters
----------
X : DTYPE_t 2d memory view
X : float32_t 2d memory view
The memory view on the data ndarray of the input ``X``.
Assumes that the array is c-continuous.
root_node : tree Node pointer
Expand All @@ -63,7 +61,7 @@ cdef void _predict_regression_tree_inplace_fast_dense(
``out`` is assumed to be a two-dimensional array of
shape ``(n_samples, K)``.
"""
cdef SIZE_t n_samples = X.shape[0]
cdef intp_t n_samples = X.shape[0]
cdef Py_ssize_t i
cdef Node *node
for i in range(n_samples):
Expand All @@ -87,20 +85,20 @@ def _predict_regression_tree_stages_sparse(

The function assumes that the ndarray that wraps ``X`` is csr_matrix.
"""
cdef const DTYPE_t[::1] X_data = X.data
cdef const INT32_t[::1] X_indices = X.indices
cdef const INT32_t[::1] X_indptr = X.indptr
cdef const float32_t[::1] X_data = X.data
cdef const int32_t[::1] X_indices = X.indices
cdef const int32_t[::1] X_indptr = X.indptr

cdef SIZE_t n_samples = X.shape[0]
cdef SIZE_t n_features = X.shape[1]
cdef SIZE_t n_stages = estimators.shape[0]
cdef SIZE_t n_outputs = estimators.shape[1]
cdef intp_t n_samples = X.shape[0]
cdef intp_t n_features = X.shape[1]
cdef intp_t n_stages = estimators.shape[0]
cdef intp_t n_outputs = estimators.shape[1]

# Indices and temporary variables
cdef SIZE_t sample_i
cdef SIZE_t feature_i
cdef SIZE_t stage_i
cdef SIZE_t output_i
cdef intp_t sample_i
cdef intp_t feature_i
cdef intp_t stage_i
cdef intp_t output_i
cdef Node *root_node = NULL
cdef Node *node = NULL
cdef double *value = NULL
Expand All @@ -117,18 +115,18 @@ def _predict_regression_tree_stages_sparse(
values[stage_i * n_outputs + output_i] = tree.value

# Initialize auxiliary data-structure
cdef DTYPE_t feature_value = 0.
cdef DTYPE_t* X_sample = NULL
cdef float32_t feature_value = 0.
cdef float32_t* X_sample = NULL

# feature_to_sample as a data structure records the last seen sample
# for each feature; functionally, it is an efficient way to identify
# which features are nonzero in the present sample.
cdef SIZE_t* feature_to_sample = NULL
cdef intp_t* feature_to_sample = NULL

safe_realloc(&X_sample, n_features)
safe_realloc(&feature_to_sample, n_features)

memset(feature_to_sample, -1, n_features * sizeof(SIZE_t))
memset(feature_to_sample, -1, n_features * sizeof(intp_t))

# Cycle through all samples
for sample_i in range(n_samples):
Expand Down
57 changes: 27 additions & 30 deletions sklearn/tree/_criterion.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -10,55 +10,52 @@
# See _criterion.pyx for implementation details.
cimport numpy as cnp

from ._tree cimport DTYPE_t # Type of X
from ._tree cimport DOUBLE_t # Type of y, sample_weight
from ._tree cimport SIZE_t # Type for indices and counters
from ._tree cimport INT32_t # Signed 32 bit integer
from ._tree cimport UINT32_t # Unsigned 32 bit integer
from ..utils._typedefs cimport float64_t, intp_t


cdef class Criterion:
# The criterion computes the impurity of a node and the reduction of
# impurity of a split on that node. It also computes the output statistics
# such as the mean in regression and class probabilities in classification.

# Internal structures
cdef const DOUBLE_t[:, ::1] y # Values of y
cdef const DOUBLE_t[:] sample_weight # Sample weights
cdef const float64_t[:, ::1] y # Values of y
cdef const float64_t[:] sample_weight # Sample weights

cdef const SIZE_t[:] sample_indices # Sample indices in X, y
cdef SIZE_t start # samples[start:pos] are the samples in the left node
cdef SIZE_t pos # samples[pos:end] are the samples in the right node
cdef SIZE_t end
cdef SIZE_t n_missing # Number of missing values for the feature being evaluated
cdef bint missing_go_to_left # Whether missing values go to the left node
cdef const intp_t[:] sample_indices # Sample indices in X, y
cdef intp_t start # samples[start:pos] are the samples in the left node
cdef intp_t pos # samples[pos:end] are the samples in the right node
cdef intp_t end
cdef intp_t n_missing # Number of missing values for the feature being evaluated
cdef bint missing_go_to_left # Whether missing values go to the left node

cdef SIZE_t n_outputs # Number of outputs
cdef SIZE_t n_samples # Number of samples
cdef SIZE_t n_node_samples # Number of samples in the node (end-start)
cdef double weighted_n_samples # Weighted number of samples (in total)
cdef double weighted_n_node_samples # Weighted number of samples in the node
cdef double weighted_n_left # Weighted number of samples in the left node
cdef double weighted_n_right # Weighted number of samples in the right node
cdef double weighted_n_missing # Weighted number of samples that are missing
cdef intp_t n_outputs # Number of outputs
cdef intp_t n_samples # Number of samples
cdef intp_t n_node_samples # Number of samples in the node (end-start)
cdef double weighted_n_samples # Weighted number of samples (in total)
cdef double weighted_n_node_samples # Weighted number of samples in the node
cdef double weighted_n_left # Weighted number of samples in the left node
cdef double weighted_n_right # Weighted number of samples in the right node
cdef double weighted_n_missing # Weighted number of samples that are missing
Comment on lines +35 to +39
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could double (here and also elsewhere) be changed to float64_t?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I second this suggestion.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I can make the change in this PR, or do it in a follow-on? I could do the int there as well as @lorentzenchr suggested below.

WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A separate PR would be easier to review.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In that case, I'll leave this as is for now. Would this PR be mergeable? I can help test and make the systematic change in the tree submodule once we resolve this PR?


# The criterion object is maintained such that left and right collected
# statistics correspond to samples[start:pos] and samples[pos:end].

# Methods
cdef int init(
self,
const DOUBLE_t[:, ::1] y,
const DOUBLE_t[:] sample_weight,
const float64_t[:, ::1] y,
const float64_t[:] sample_weight,
double weighted_n_samples,
const SIZE_t[:] sample_indices,
SIZE_t start,
SIZE_t end
const intp_t[:] sample_indices,
intp_t start,
intp_t end
) except -1 nogil
cdef void init_sum_missing(self)
cdef void init_missing(self, SIZE_t n_missing) noexcept nogil
cdef void init_missing(self, intp_t n_missing) noexcept nogil
cdef int reset(self) except -1 nogil
cdef int reverse_reset(self) except -1 nogil
cdef int update(self, SIZE_t new_pos) except -1 nogil
cdef int update(self, intp_t new_pos) except -1 nogil
cdef double node_impurity(self) noexcept nogil
cdef void children_impurity(
self,
Expand Down Expand Up @@ -101,8 +98,8 @@ cdef class Criterion:
cdef class ClassificationCriterion(Criterion):
"""Abstract criterion for classification."""

cdef SIZE_t[::1] n_classes
cdef SIZE_t max_n_classes
cdef intp_t[::1] n_classes
cdef intp_t max_n_classes

cdef double[:, ::1] sum_total # The sum of the weighted count of each label.
cdef double[:, ::1] sum_left # Same as above, but for the left side of the split
Expand Down
Loading