From 48a58e8edadd4380e0b65eaa59d893849fd8b88f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 7 Oct 2023 01:11:04 -0400 Subject: [PATCH 01/10] Working replacement Signed-off-by: Adam Li --- sklearn/tree/_criterion.pxd | 8 ++++---- sklearn/tree/_criterion.pyx | 32 ++++++++++++++++---------------- sklearn/tree/_splitter.pxd | 6 +++--- sklearn/tree/_splitter.pyx | 26 +++++++++++++------------- 4 files changed, 36 insertions(+), 36 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 6538b9b824a79..6822dd63fdaa2 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -42,7 +42,7 @@ cdef class Criterion: # statistics correspond to samples[start:pos] and samples[pos:end]. # Methods - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, @@ -53,9 +53,9 @@ cdef class Criterion: ) except -1 nogil cdef void init_sum_missing(self) cdef void init_missing(self, intp_t n_missing) noexcept nogil - cdef int reset(self) except -1 nogil - cdef int reverse_reset(self) except -1 nogil - cdef int update(self, intp_t new_pos) except -1 nogil + cdef intp_t reset(self) except -1 nogil + cdef intp_t reverse_reset(self) except -1 nogil + cdef intp_t update(self, intp_t new_pos) except -1 nogil cdef float64_t node_impurity(self) noexcept nogil cdef void children_impurity( self, diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 89a7639f9bbcf..46bed56c5ef1d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -40,7 +40,7 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, @@ -87,21 +87,21 @@ cdef class Criterion: """ pass - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start. This method must be implemented by the subclass. """ pass - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. This method must be implemented by the subclass. """ pass - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. This updates the collected statistics by moving sample_indices[pos:new_pos] @@ -347,7 +347,7 @@ cdef class ClassificationCriterion(Criterion): return (type(self), (self.n_outputs, np.asarray(self.n_classes)), self.__getstate__()) - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, @@ -450,7 +450,7 @@ cdef class ClassificationCriterion(Criterion): self.weighted_n_missing += w - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -467,7 +467,7 @@ cdef class ClassificationCriterion(Criterion): ) return 0 - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -484,7 +484,7 @@ cdef class ClassificationCriterion(Criterion): ) return 0 - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -857,7 +857,7 @@ cdef class RegressionCriterion(Criterion): def __reduce__(self): return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, @@ -944,7 +944,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_missing += w - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start.""" self.pos = self.start _move_sums_regression( @@ -957,7 +957,7 @@ cdef class RegressionCriterion(Criterion): ) return 0 - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end.""" self.pos = self.end _move_sums_regression( @@ -970,7 +970,7 @@ cdef class RegressionCriterion(Criterion): ) return 0 - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left.""" cdef const float64_t[:] sample_weight = self.sample_weight cdef const intp_t[:] sample_indices = self.sample_indices @@ -1217,7 +1217,7 @@ cdef class MAE(RegressionCriterion): self.left_child_ptr = cnp.PyArray_DATA(self.left_child) self.right_child_ptr = cnp.PyArray_DATA(self.right_child) - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, @@ -1279,7 +1279,7 @@ cdef class MAE(RegressionCriterion): with gil: raise ValueError("missing values is not supported for MAE.") - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1310,7 +1310,7 @@ cdef class MAE(RegressionCriterion): weight) return 0 - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1338,7 +1338,7 @@ cdef class MAE(RegressionCriterion): weight) return 0 - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. Returns -1 in case of failure to allocate memory (and raise MemoryError) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index adc14011cb7a2..129455a99866c 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -83,7 +83,7 @@ cdef class Splitter: # This allows optimization with depth-based tree building. # Methods - cdef int init( + cdef intp_t init( self, object X, const float64_t[:, ::1] y, @@ -91,14 +91,14 @@ cdef class Splitter: const unsigned char[::1] missing_values_in_feature_mask, ) except -1 - cdef int node_reset( + cdef intp_t node_reset( self, intp_t start, intp_t end, float64_t* weighted_n_node_samples ) except -1 nogil - cdef int node_split( + cdef intp_t node_split( self, float64_t impurity, # Impurity of the node SplitRecord* split, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index a9d3a169ec84a..084ecb8745231 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -119,7 +119,7 @@ cdef class Splitter: self.random_state, self.monotonic_cst), self.__getstate__()) - cdef int init( + cdef intp_t init( self, object X, const float64_t[:, ::1] y, @@ -193,7 +193,7 @@ cdef class Splitter: self.criterion.init_sum_missing() return 0 - cdef int node_reset(self, intp_t start, intp_t end, + cdef intp_t node_reset(self, intp_t start, intp_t end, float64_t* weighted_n_node_samples) except -1 nogil: """Reset splitter on node samples[start:end]. @@ -225,7 +225,7 @@ cdef class Splitter: weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef int node_split( + cdef intp_t node_split( self, float64_t impurity, SplitRecord* split, @@ -286,7 +286,7 @@ ctypedef fused Partitioner: DensePartitioner SparsePartitioner -cdef inline int node_split_best( +cdef inline intp_t node_split_best( Splitter splitter, Partitioner partitioner, Criterion criterion, @@ -667,7 +667,7 @@ cdef void heapsort(float32_t* feature_values, intp_t* samples, intp_t n) noexcep sift_down(feature_values, samples, 0, end) end = end - 1 -cdef inline int node_split_random( +cdef inline intp_t node_split_random( Splitter splitter, Partitioner partitioner, Criterion criterion, @@ -1490,7 +1490,7 @@ cdef inline void sparse_swap(intp_t[::1] index_to_samples, intp_t[::1] samples, cdef class BestSplitter(Splitter): """Splitter for finding the best split on dense data.""" cdef DensePartitioner partitioner - cdef int init( + cdef intp_t init( self, object X, const float64_t[:, ::1] y, @@ -1502,7 +1502,7 @@ cdef class BestSplitter(Splitter): X, self.samples, self.feature_values, missing_values_in_feature_mask ) - cdef int node_split( + cdef intp_t node_split( self, float64_t impurity, SplitRecord* split, @@ -1526,7 +1526,7 @@ cdef class BestSplitter(Splitter): cdef class BestSparseSplitter(Splitter): """Splitter for finding the best split, using the sparse data.""" cdef SparsePartitioner partitioner - cdef int init( + cdef intp_t init( self, object X, const float64_t[:, ::1] y, @@ -1538,7 +1538,7 @@ cdef class BestSparseSplitter(Splitter): X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) - cdef int node_split( + cdef intp_t node_split( self, float64_t impurity, SplitRecord* split, @@ -1562,7 +1562,7 @@ cdef class BestSparseSplitter(Splitter): cdef class RandomSplitter(Splitter): """Splitter for finding the best random split on dense data.""" cdef DensePartitioner partitioner - cdef int init( + cdef intp_t init( self, object X, const float64_t[:, ::1] y, @@ -1574,7 +1574,7 @@ cdef class RandomSplitter(Splitter): X, self.samples, self.feature_values, missing_values_in_feature_mask ) - cdef int node_split( + cdef intp_t node_split( self, float64_t impurity, SplitRecord* split, @@ -1598,7 +1598,7 @@ cdef class RandomSplitter(Splitter): cdef class RandomSparseSplitter(Splitter): """Splitter for finding the best random split, using the sparse data.""" cdef SparsePartitioner partitioner - cdef int init( + cdef intp_t init( self, object X, const float64_t[:, ::1] y, @@ -1609,7 +1609,7 @@ cdef class RandomSparseSplitter(Splitter): self.partitioner = SparsePartitioner( X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) - cdef int node_split( + cdef intp_t node_split( self, float64_t impurity, SplitRecord* split, From 4e2232e04f8ee44319a35c59b060ac7d14d29361 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 7 Oct 2023 01:15:53 -0400 Subject: [PATCH 02/10] Convert rest of files Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 4 ++-- sklearn/tree/_tree.pxd | 4 ++-- sklearn/tree/_tree.pyx | 48 +++++++++++++++++++------------------- sklearn/tree/_utils.pxd | 22 ++++++++--------- sklearn/tree/_utils.pyx | 28 +++++++++++----------- 5 files changed, 53 insertions(+), 53 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 084ecb8745231..2a31de65f8092 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -559,7 +559,7 @@ cdef inline intp_t node_split_best( cdef inline void sort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: if n == 0: return - cdef int maxd = 2 * log(n) + cdef intp_t maxd = 2 * log(n) introsort(feature_values, samples, n, maxd) @@ -593,7 +593,7 @@ cdef inline float32_t median3(float32_t* feature_values, intp_t n) noexcept nogi # Introsort with median of 3 pivot selection and 3-way partition function # (robust to repeated elements, e.g. lots of zero features). cdef void introsort(float32_t* feature_values, intp_t *samples, - intp_t n, int maxd) noexcept nogil: + intp_t n, intp_t maxd) noexcept nogil: cdef float32_t pivot cdef intp_t i, l, r diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index e4081921f40f9..449c942aea3f1 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -57,8 +57,8 @@ cdef class Tree: intp_t n_node_samples, float64_t weighted_n_node_samples, unsigned char missing_go_to_left) except -1 nogil - cdef int _resize(self, intp_t capacity) except -1 nogil - cdef int _resize_c(self, intp_t capacity=*) except -1 nogil + cdef intp_t _resize(self, intp_t capacity) except -1 nogil + cdef intp_t _resize_c(self, intp_t capacity=*) except -1 nogil cdef cnp.ndarray _get_value_ndarray(self) cdef cnp.ndarray _get_node_ndarray(self) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index b4ce56a4d2a0b..12959f07138d5 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -63,10 +63,10 @@ cdef float64_t INFINITY = np.inf cdef float64_t EPSILON = np.finfo('double').eps # Some handy constants (BestFirstTreeBuilder) -cdef int IS_FIRST = 1 -cdef int IS_NOT_FIRST = 0 -cdef int IS_LEFT = 1 -cdef int IS_NOT_LEFT = 0 +cdef intp_t IS_FIRST = 1 +cdef intp_t IS_NOT_FIRST = 0 +cdef intp_t IS_LEFT = 1 +cdef intp_t IS_NOT_LEFT = 0 TREE_LEAF = -1 TREE_UNDEFINED = -2 @@ -177,10 +177,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): X, y, sample_weight = self._check_input(X, y, sample_weight) # Initial capacity - cdef int init_capacity + cdef intp_t init_capacity if tree.max_depth <= 10: - init_capacity = (2 ** (tree.max_depth + 1)) - 1 + init_capacity = (2 ** (tree.max_depth + 1)) - 1 else: init_capacity = 2047 @@ -219,7 +219,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef bint is_leaf cdef bint first = 1 cdef intp_t max_depth_seen = -1 - cdef int rc = 0 + cdef intp_t rc = 0 cdef stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -455,7 +455,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef intp_t max_split_nodes = max_leaf_nodes - 1 cdef bint is_leaf cdef intp_t max_depth_seen = -1 - cdef int rc = 0 + cdef intp_t rc = 0 cdef Node* node # Initial capacity @@ -587,7 +587,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if rc == -1: raise MemoryError() - cdef inline int _add_split_node( + cdef inline intp_t _add_split_node( self, Splitter splitter, Tree tree, @@ -696,32 +696,32 @@ cdef class Tree: Attributes ---------- - node_count : int + node_count : intp_t The number of nodes (internal nodes + leaves) in the tree. - capacity : int + capacity : intp_t The current capacity (i.e., size) of the arrays, which is at least as great as `node_count`. - max_depth : int + max_depth : intp_t The depth of the tree, i.e. the maximum depth of its leaves. - children_left : array of int, shape [node_count] + children_left : array of intp_t, shape [node_count] children_left[i] holds the node id of the left child of node i. For leaves, children_left[i] == TREE_LEAF. Otherwise, children_left[i] > i. This child handles the case where X[:, feature[i]] <= threshold[i]. - children_right : array of int, shape [node_count] + children_right : array of intp_t, shape [node_count] children_right[i] holds the node id of the right child of node i. For leaves, children_right[i] == TREE_LEAF. Otherwise, children_right[i] > i. This child handles the case where X[:, feature[i]] > threshold[i]. - n_leaves : int + n_leaves : intp_t Number of leaves in the tree. - feature : array of int, shape [node_count] + feature : array of intp_t, shape [node_count] feature[i] holds the feature to split on, for the internal node i. threshold : array of float64_t, shape [node_count] @@ -734,7 +734,7 @@ cdef class Tree: impurity[i] holds the impurity (i.e., the value of the splitting criterion) at node i. - n_node_samples : array of int, shape [node_count] + n_node_samples : array of intp_t, shape [node_count] n_node_samples[i] holds the number of training samples reaching node i. weighted_n_node_samples : array of float64_t, shape [node_count] @@ -797,7 +797,7 @@ cdef class Tree: # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs): """Constructor.""" cdef intp_t dummy = 0 size_t_dtype = np.array(dummy).dtype @@ -878,7 +878,7 @@ cdef class Tree: memcpy(self.value, cnp.PyArray_DATA(value_ndarray), self.capacity * self.value_stride * sizeof(float64_t)) - cdef int _resize(self, intp_t capacity) except -1 nogil: + cdef intp_t _resize(self, intp_t capacity) except -1 nogil: """Resize all inner arrays to `capacity`, if `capacity` == -1, then double the size of the inner arrays. @@ -890,7 +890,7 @@ cdef class Tree: with gil: raise MemoryError() - cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: + cdef intp_t _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1341,7 +1341,7 @@ cdef class Tree: return arr def compute_partial_dependence(self, float32_t[:, ::1] X, - int[::1] target_features, + intp_t[::1] target_features, float64_t[::1] out): """Partial dependence of the response on the ``target_feature`` set. @@ -1377,7 +1377,7 @@ cdef class Tree: dtype=np.intp) intp_t sample_idx intp_t feature_idx - int stack_size + intp_t stack_size float64_t left_sample_frac float64_t current_weight float64_t total_weight # used for sanity check only @@ -1625,7 +1625,7 @@ cdef class _PathFinder(_CCPPruneController): cdef float64_t[:] impurities cdef uint32_t count - def __cinit__(self, int node_count): + def __cinit__(self, intp_t node_count): self.ccp_alphas = np.zeros(shape=(node_count), dtype=np.float64) self.impurities = np.zeros(shape=(node_count), dtype=np.float64) self.count = 0 @@ -1923,7 +1923,7 @@ cdef _build_pruned_tree( # value_stride for original tree and new tree are the same intp_t value_stride = orig_tree.value_stride intp_t max_depth_seen = -1 - int rc = 0 + intp_t rc = 0 Node* node float64_t* orig_value_ptr float64_t* new_value_ptr diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 4167230bfbf4d..83e45baf748e9 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -70,12 +70,12 @@ cdef class WeightedPQueue: cdef WeightedPQueueRecord* array_ cdef bint is_empty(self) noexcept nogil - cdef int reset(self) except -1 nogil + cdef intp_t reset(self) except -1 nogil cdef intp_t size(self) noexcept nogil - cdef int push(self, float64_t data, float64_t weight) except -1 nogil - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t peek(self, float64_t* data, float64_t* weight) noexcept nogil cdef float64_t get_weight_from_index(self, intp_t index) noexcept nogil cdef float64_t get_value_from_index(self, intp_t index) noexcept nogil @@ -91,14 +91,14 @@ cdef class WeightedMedianCalculator: cdef intp_t k cdef float64_t sum_w_0_k # represents sum(weights[0:k]) = w[0] + w[1] + ... + w[k-1] cdef intp_t size(self) noexcept nogil - cdef int push(self, float64_t data, float64_t weight) except -1 nogil - cdef int reset(self) except -1 nogil - cdef int update_median_parameters_post_push( + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil + cdef intp_t reset(self) except -1 nogil + cdef intp_t update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef int update_median_parameters_post_remove( + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil cdef float64_t get_median(self) noexcept nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 3c0c312b25fbe..effcd07b63034 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -104,7 +104,7 @@ cdef class WeightedPQueue: def __dealloc__(self): free(self.array_) - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the WeightedPQueue to its state at construction Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -121,7 +121,7 @@ cdef class WeightedPQueue: cdef intp_t size(self) noexcept nogil: return self.array_ptr - cdef int push(self, float64_t data, float64_t weight) except -1 nogil: + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil: """Push record on the array. Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -153,7 +153,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr + 1 return 0 - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil: + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil: """Remove a specific value/weight record from the array. Returns 0 if successful, -1 if record not found.""" cdef intp_t array_ptr = self.array_ptr @@ -181,7 +181,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr - 1 return 0 - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil: """Remove the top (minimum) element from array. Returns 0 if successful, -1 if nothing to remove.""" cdef intp_t array_ptr = self.array_ptr @@ -202,7 +202,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr - 1 return 0 - cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef intp_t peek(self, float64_t* data, float64_t* weight) noexcept nogil: """Write the top element from array to a pointer. Returns 0 if successful, -1 if nothing to write.""" cdef WeightedPQueueRecord* array = self.array_ @@ -279,7 +279,7 @@ cdef class WeightedMedianCalculator: WeightedMedianCalculator""" return self.samples.size() - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the WeightedMedianCalculator to its state at construction Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -293,7 +293,7 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k = 0 return 0 - cdef int push(self, float64_t data, float64_t weight) except -1 nogil: + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil: """Push a value and its associated weight to the WeightedMedianCalculator Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -310,7 +310,7 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef int update_median_parameters_post_push( + cdef intp_t update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil: """Update the parameters used in the median calculation, @@ -352,7 +352,7 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil: + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil: """Remove a value from the MedianHeap, removing it from consideration in the median calculation """ @@ -367,7 +367,7 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil: """Pop a value from the MedianHeap, starting from the left and moving to the right. """ @@ -387,7 +387,7 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef int update_median_parameters_post_remove( + cdef intp_t update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil: """Update the parameters used in the median calculation, @@ -452,9 +452,9 @@ cdef class WeightedMedianCalculator: def _any_isnan_axis0(const float32_t[:, :] X): """Same as np.any(np.isnan(X), axis=0)""" cdef: - int i, j - int n_samples = X.shape[0] - int n_features = X.shape[1] + intp_t i, j + intp_t n_samples = X.shape[0] + intp_t n_features = X.shape[1] unsigned char[::1] isnan_out = np.zeros(X.shape[1], dtype=np.bool_) with nogil: From 357400e25c2f2f1a52a49a534eabfefeaff9e3f3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 7 Oct 2023 01:18:50 -0400 Subject: [PATCH 03/10] Finish ctypedef conversion Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 8 ++++++-- sklearn/tree/_tree.pyx | 6 +++--- sklearn/tree/_utils.pyx | 6 +++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 2a31de65f8092..235ba81f649a4 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -193,8 +193,12 @@ cdef class Splitter: self.criterion.init_sum_missing() return 0 - cdef intp_t node_reset(self, intp_t start, intp_t end, - float64_t* weighted_n_node_samples) except -1 nogil: + cdef intp_t node_reset( + self, + intp_t start, + intp_t end, + float64_t* weighted_n_node_samples + ) except -1 nogil: """Reset splitter on node samples[start:end]. Returns -1 in case of failure to allocate memory (and raise MemoryError) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 12959f07138d5..e10c15631a3da 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -38,10 +38,10 @@ from ._utils cimport sizet_ptr_to_ndarray cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, - int nd, cnp.npy_intp* dims, + intp_t nd, cnp.npy_intp* dims, cnp.npy_intp* strides, - void* data, int flags, object obj) - int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + void* data, intp_t flags, object obj) + intp_t PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) cdef extern from "" namespace "std" nogil: cdef cppclass stack[T]: diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index effcd07b63034..a748b1cd1a06a 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -299,7 +299,7 @@ cdef class WeightedMedianCalculator: Return -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ - cdef int return_value + cdef intp_t return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -356,7 +356,7 @@ cdef class WeightedMedianCalculator: """Remove a value from the MedianHeap, removing it from consideration in the median calculation """ - cdef int return_value + cdef intp_t return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -371,7 +371,7 @@ cdef class WeightedMedianCalculator: """Pop a value from the MedianHeap, starting from the left and moving to the right. """ - cdef int return_value + cdef intp_t return_value cdef float64_t original_median = 0.0 if self.size() != 0: From e74a0cf62eea7572ae6303322f40aafc608818e2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Oct 2023 09:49:52 -0400 Subject: [PATCH 04/10] Add conversion dtypes for target_features Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 1 + sklearn/ensemble/_gb.py | 2 ++ sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 1 + sklearn/inspection/_partial_dependence.py | 2 +- sklearn/tree/_classes.py | 1 + 6 files changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index eecd13d403744..26b8a6f8933ba 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1120,6 +1120,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): averaged_predictions = np.zeros( shape=grid.shape[0], dtype=np.float64, order="C" ) + target_features = np.asarray(target_features, dtype=np.intp, order="C") for tree in self.estimators_: # Note: we don't sum in parallel because the GIL isn't released in diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 5982f8a7fb952..8ed90db1f6b61 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -1061,6 +1061,8 @@ def _compute_partial_dependence_recursion(self, grid, target_features): averaged_predictions = np.zeros( (n_trees_per_stage, grid.shape[0]), dtype=np.float64, order="C" ) + target_features = np.asarray(target_features, dtype=np.intp, order="C") + for stage in range(n_estimators): for k in range(n_trees_per_stage): tree = self.estimators_[stage, k].tree_ diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index a84c7dbf9f280..43f87f1f72de7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -148,7 +148,7 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( def _compute_partial_dependence( node_struct [:] nodes, const X_DTYPE_C [:, ::1] X, - int [:] target_features, + intp_t [:] target_features, Y_DTYPE_C [:] out ): """Partial dependence of the response on the ``target_features`` set. @@ -173,7 +173,7 @@ def _compute_partial_dependence( X : view on 2d ndarray, shape (n_samples, n_target_features) The grid points on which the partial dependence should be evaluated. - target_features : view on 1d ndarray, shape (n_target_features) + target_features : view on 1d ndarray of intp_t, shape (n_target_features) The set of target features for which the partial dependence should be evaluated. out : view on 1d ndarray, shape (n_samples) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index c3af930654b73..a8bad4845b12c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1167,6 +1167,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): averaged_predictions = np.zeros( (self.n_trees_per_iteration_, grid.shape[0]), dtype=Y_DTYPE ) + target_features = np.asarray(target_features, dtype=np.intp, order="C") for predictors_of_ith_iteration in self._predictors: for k, predictor in enumerate(predictors_of_ith_iteration): diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index d54adc90444fc..13425fa7e1c4b 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -660,7 +660,7 @@ def partial_dependence( raise ValueError("all features must be in [0, {}]".format(X.shape[1] - 1)) features_indices = np.asarray( - _get_column_indices(X, features), dtype=np.int32, order="C" + _get_column_indices(X, features), dtype=np.intp, order="C" ).ravel() feature_names = _check_feature_names(X, feature_names) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 03ba2f108bbdd..6523832328942 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1436,6 +1436,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): averaged_predictions = np.zeros( shape=grid.shape[0], dtype=np.float64, order="C" ) + target_features = np.asarray(target_features, dtype=np.intp, order="C") self.tree_.compute_partial_dependence( grid, target_features, averaged_predictions From e2a12046deb9eeab0dfbeb28dc5a365f9ac92d67 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Oct 2023 09:52:31 -0400 Subject: [PATCH 05/10] Add inline comment Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 235ba81f649a4..887756370524c 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1344,7 +1344,11 @@ cdef class SparsePartitioner: cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil: - """Comparison function for sort.""" + """Comparison function for sort. + + This must return an `int` as it is used by stdlib's qsort, which expects + an `int` return value. + """ return ((a)[0] - (b)[0]) From 9d957695006af90237a38ec15d877381a24ee7c6 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Oct 2023 11:18:54 -0400 Subject: [PATCH 06/10] run lint From 877dc3221c97b36851acfbf097150192367ef83b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 3 Nov 2023 16:09:19 -0400 Subject: [PATCH 07/10] Consolidate unit-testing Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 6 +++--- sklearn/inspection/tests/test_partial_dependence.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 26b8a6f8933ba..600f56acbc419 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1104,10 +1104,10 @@ def _compute_partial_dependence_recursion(self, grid, target_features): Parameters ---------- - grid : ndarray of shape (n_samples, n_target_features) + grid : ndarray of shape (n_samples, n_target_features) of type DTYPE The grid points on which the partial dependence should be evaluated. - target_features : ndarray of shape (n_target_features) + target_features : ndarray of shape (n_target_features) of type np.intp The set of target features for which the partial dependence should be evaluated. @@ -1117,10 +1117,10 @@ def _compute_partial_dependence_recursion(self, grid, target_features): The value of the partial dependence function on each grid point. """ grid = np.asarray(grid, dtype=DTYPE, order="C") + target_features = np.asarray(target_features, dtype=np.intp, order="C") averaged_predictions = np.zeros( shape=grid.shape[0], dtype=np.float64, order="C" ) - target_features = np.asarray(target_features, dtype=np.intp, order="C") for tree in self.estimators_: # Note: we don't sum in parallel because the GIL isn't released in diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 0336dc4b827fe..b052609a85a2b 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -272,7 +272,7 @@ def test_partial_dependence_helpers(est, method, target_feature): est.fit(X, y) # target feature will be set to .5 and then to 123 - features = np.array([target_feature], dtype=np.int32) + features = np.array([target_feature], dtype=np.intp) grid = np.array([[0.5], [123]]) if method == "brute": @@ -356,7 +356,7 @@ def test_recursion_decision_tree_vs_forest_and_gbdt(seed): grid = rng.randn(50).reshape(-1, 1) for f in range(n_features): - features = np.array([f], dtype=np.int32) + features = np.array([f], dtype=np.intp) pdp_forest = _partial_dependence_recursion(forest, grid, features) pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features) From b41e857757a32c746744fbd9abbbec106755fdf8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 3 Nov 2023 17:55:59 -0400 Subject: [PATCH 08/10] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 43f87f1f72de7..01a6ea855751b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -148,7 +148,7 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( def _compute_partial_dependence( node_struct [:] nodes, const X_DTYPE_C [:, ::1] X, - intp_t [:] target_features, + const intp_t [:] target_features, Y_DTYPE_C [:] out ): """Partial dependence of the response on the ``target_features`` set. diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 19174f7dcb8d3..3ba5b52990cc0 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1343,7 +1343,7 @@ cdef class Tree: return arr def compute_partial_dependence(self, float32_t[:, ::1] X, - intp_t[::1] target_features, + const intp_t[::1] target_features, float64_t[::1] out): """Partial dependence of the response on the ``target_feature`` set. From 8b9a5dc0f4a67903e5152bba64d0be299b9fd877 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 17 Jan 2024 12:18:36 -0500 Subject: [PATCH 09/10] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- sklearn/ensemble/_forest.py | 4 ++-- sklearn/tree/_criterion.pxd | 8 ++++---- sklearn/tree/_criterion.pyx | 32 ++++++++++++++++---------------- sklearn/tree/_splitter.pxd | 6 +++--- sklearn/tree/_splitter.pyx | 26 +++++++++++++------------- sklearn/tree/_tree.pxd | 4 ++-- sklearn/tree/_tree.pyx | 18 +++++++++--------- sklearn/tree/_utils.pxd | 22 +++++++++++----------- sklearn/tree/_utils.pyx | 28 ++++++++++++++-------------- 9 files changed, 74 insertions(+), 74 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index adaa5b11fd210..1610d1068cfb5 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1135,10 +1135,10 @@ def _compute_partial_dependence_recursion(self, grid, target_features): Parameters ---------- - grid : ndarray of shape (n_samples, n_target_features) of type DTYPE + grid : ndarray of shape (n_samples, n_target_features), dtype=DTYPE The grid points on which the partial dependence should be evaluated. - target_features : ndarray of shape (n_target_features) of type np.intp + target_features : ndarray of shape (n_target_features), dtype=np.intp The set of target features for which the partial dependence should be evaluated. diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 6822dd63fdaa2..6538b9b824a79 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -42,7 +42,7 @@ cdef class Criterion: # statistics correspond to samples[start:pos] and samples[pos:end]. # Methods - cdef intp_t init( + cdef int init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, @@ -53,9 +53,9 @@ cdef class Criterion: ) except -1 nogil cdef void init_sum_missing(self) cdef void init_missing(self, intp_t n_missing) noexcept nogil - cdef intp_t reset(self) except -1 nogil - cdef intp_t reverse_reset(self) except -1 nogil - cdef intp_t update(self, intp_t new_pos) except -1 nogil + cdef int reset(self) except -1 nogil + cdef int reverse_reset(self) except -1 nogil + cdef int update(self, intp_t new_pos) except -1 nogil cdef float64_t node_impurity(self) noexcept nogil cdef void children_impurity( self, diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index f493199ee9c89..cb20db9ddb69c 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -40,7 +40,7 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef intp_t init( + cdef int init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, @@ -87,21 +87,21 @@ cdef class Criterion: """ pass - cdef intp_t reset(self) except -1 nogil: + cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. This method must be implemented by the subclass. """ pass - cdef intp_t reverse_reset(self) except -1 nogil: + cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. This method must be implemented by the subclass. """ pass - cdef intp_t update(self, intp_t new_pos) except -1 nogil: + cdef int update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. This updates the collected statistics by moving sample_indices[pos:new_pos] @@ -347,7 +347,7 @@ cdef class ClassificationCriterion(Criterion): return (type(self), (self.n_outputs, np.asarray(self.n_classes)), self.__getstate__()) - cdef intp_t init( + cdef int init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, @@ -450,7 +450,7 @@ cdef class ClassificationCriterion(Criterion): self.weighted_n_missing += w - cdef intp_t reset(self) except -1 nogil: + cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -467,7 +467,7 @@ cdef class ClassificationCriterion(Criterion): ) return 0 - cdef intp_t reverse_reset(self) except -1 nogil: + cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -484,7 +484,7 @@ cdef class ClassificationCriterion(Criterion): ) return 0 - cdef intp_t update(self, intp_t new_pos) except -1 nogil: + cdef int update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -860,7 +860,7 @@ cdef class RegressionCriterion(Criterion): def __reduce__(self): return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) - cdef intp_t init( + cdef int init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, @@ -947,7 +947,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_missing += w - cdef intp_t reset(self) except -1 nogil: + cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start.""" self.pos = self.start _move_sums_regression( @@ -960,7 +960,7 @@ cdef class RegressionCriterion(Criterion): ) return 0 - cdef intp_t reverse_reset(self) except -1 nogil: + cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end.""" self.pos = self.end _move_sums_regression( @@ -973,7 +973,7 @@ cdef class RegressionCriterion(Criterion): ) return 0 - cdef intp_t update(self, intp_t new_pos) except -1 nogil: + cdef int update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left.""" cdef const float64_t[:] sample_weight = self.sample_weight cdef const intp_t[:] sample_indices = self.sample_indices @@ -1220,7 +1220,7 @@ cdef class MAE(RegressionCriterion): self.left_child_ptr = cnp.PyArray_DATA(self.left_child) self.right_child_ptr = cnp.PyArray_DATA(self.right_child) - cdef intp_t init( + cdef int init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, @@ -1282,7 +1282,7 @@ cdef class MAE(RegressionCriterion): with gil: raise ValueError("missing values is not supported for MAE.") - cdef intp_t reset(self) except -1 nogil: + cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1313,7 +1313,7 @@ cdef class MAE(RegressionCriterion): weight) return 0 - cdef intp_t reverse_reset(self) except -1 nogil: + cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1341,7 +1341,7 @@ cdef class MAE(RegressionCriterion): weight) return 0 - cdef intp_t update(self, intp_t new_pos) except -1 nogil: + cdef int update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. Returns -1 in case of failure to allocate memory (and raise MemoryError) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 129455a99866c..adc14011cb7a2 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -83,7 +83,7 @@ cdef class Splitter: # This allows optimization with depth-based tree building. # Methods - cdef intp_t init( + cdef int init( self, object X, const float64_t[:, ::1] y, @@ -91,14 +91,14 @@ cdef class Splitter: const unsigned char[::1] missing_values_in_feature_mask, ) except -1 - cdef intp_t node_reset( + cdef int node_reset( self, intp_t start, intp_t end, float64_t* weighted_n_node_samples ) except -1 nogil - cdef intp_t node_split( + cdef int node_split( self, float64_t impurity, # Impurity of the node SplitRecord* split, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 46c89e3578204..52253b2d12eaa 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -119,7 +119,7 @@ cdef class Splitter: self.random_state, self.monotonic_cst), self.__getstate__()) - cdef intp_t init( + cdef int init( self, object X, const float64_t[:, ::1] y, @@ -193,7 +193,7 @@ cdef class Splitter: self.criterion.init_sum_missing() return 0 - cdef intp_t node_reset( + cdef int node_reset( self, intp_t start, intp_t end, @@ -229,7 +229,7 @@ cdef class Splitter: weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef intp_t node_split( + cdef int node_split( self, float64_t impurity, SplitRecord* split, @@ -290,7 +290,7 @@ ctypedef fused Partitioner: DensePartitioner SparsePartitioner -cdef inline intp_t node_split_best( +cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, Criterion criterion, @@ -671,7 +671,7 @@ cdef void heapsort(float32_t* feature_values, intp_t* samples, intp_t n) noexcep sift_down(feature_values, samples, 0, end) end = end - 1 -cdef inline intp_t node_split_random( +cdef inline int node_split_random( Splitter splitter, Partitioner partitioner, Criterion criterion, @@ -1498,7 +1498,7 @@ cdef inline void sparse_swap(intp_t[::1] index_to_samples, intp_t[::1] samples, cdef class BestSplitter(Splitter): """Splitter for finding the best split on dense data.""" cdef DensePartitioner partitioner - cdef intp_t init( + cdef int init( self, object X, const float64_t[:, ::1] y, @@ -1510,7 +1510,7 @@ cdef class BestSplitter(Splitter): X, self.samples, self.feature_values, missing_values_in_feature_mask ) - cdef intp_t node_split( + cdef int node_split( self, float64_t impurity, SplitRecord* split, @@ -1534,7 +1534,7 @@ cdef class BestSplitter(Splitter): cdef class BestSparseSplitter(Splitter): """Splitter for finding the best split, using the sparse data.""" cdef SparsePartitioner partitioner - cdef intp_t init( + cdef int init( self, object X, const float64_t[:, ::1] y, @@ -1546,7 +1546,7 @@ cdef class BestSparseSplitter(Splitter): X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) - cdef intp_t node_split( + cdef int node_split( self, float64_t impurity, SplitRecord* split, @@ -1570,7 +1570,7 @@ cdef class BestSparseSplitter(Splitter): cdef class RandomSplitter(Splitter): """Splitter for finding the best random split on dense data.""" cdef DensePartitioner partitioner - cdef intp_t init( + cdef int init( self, object X, const float64_t[:, ::1] y, @@ -1582,7 +1582,7 @@ cdef class RandomSplitter(Splitter): X, self.samples, self.feature_values, missing_values_in_feature_mask ) - cdef intp_t node_split( + cdef int node_split( self, float64_t impurity, SplitRecord* split, @@ -1606,7 +1606,7 @@ cdef class RandomSplitter(Splitter): cdef class RandomSparseSplitter(Splitter): """Splitter for finding the best random split, using the sparse data.""" cdef SparsePartitioner partitioner - cdef intp_t init( + cdef int init( self, object X, const float64_t[:, ::1] y, @@ -1617,7 +1617,7 @@ cdef class RandomSparseSplitter(Splitter): self.partitioner = SparsePartitioner( X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) - cdef intp_t node_split( + cdef int node_split( self, float64_t impurity, SplitRecord* split, diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 449c942aea3f1..e4081921f40f9 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -57,8 +57,8 @@ cdef class Tree: intp_t n_node_samples, float64_t weighted_n_node_samples, unsigned char missing_go_to_left) except -1 nogil - cdef intp_t _resize(self, intp_t capacity) except -1 nogil - cdef intp_t _resize_c(self, intp_t capacity=*) except -1 nogil + cdef int _resize(self, intp_t capacity) except -1 nogil + cdef int _resize_c(self, intp_t capacity=*) except -1 nogil cdef cnp.ndarray _get_value_ndarray(self) cdef cnp.ndarray _get_node_ndarray(self) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 3ba5b52990cc0..22dd74e53009d 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -38,10 +38,10 @@ from ._utils cimport sizet_ptr_to_ndarray cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, - intp_t nd, cnp.npy_intp* dims, + int nd, cnp.npy_intp* dims, cnp.npy_intp* strides, - void* data, intp_t flags, object obj) - intp_t PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + void* data, int flags, object obj) + int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) cdef extern from "" namespace "std" nogil: cdef cppclass stack[T]: @@ -219,7 +219,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef bint is_leaf cdef bint first = 1 cdef intp_t max_depth_seen = -1 - cdef intp_t rc = 0 + cdef int rc = 0 cdef stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -455,7 +455,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef intp_t max_split_nodes = max_leaf_nodes - 1 cdef bint is_leaf cdef intp_t max_depth_seen = -1 - cdef intp_t rc = 0 + cdef int rc = 0 cdef Node* node # Initial capacity @@ -587,7 +587,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if rc == -1: raise MemoryError() - cdef inline intp_t _add_split_node( + cdef inline int _add_split_node( self, Splitter splitter, Tree tree, @@ -878,7 +878,7 @@ cdef class Tree: memcpy(self.value, cnp.PyArray_DATA(value_ndarray), self.capacity * self.value_stride * sizeof(float64_t)) - cdef intp_t _resize(self, intp_t capacity) except -1 nogil: + cdef int _resize(self, intp_t capacity) except -1 nogil: """Resize all inner arrays to `capacity`, if `capacity` == -1, then double the size of the inner arrays. @@ -890,7 +890,7 @@ cdef class Tree: with gil: raise MemoryError() - cdef intp_t _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: + cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1925,7 +1925,7 @@ cdef _build_pruned_tree( # value_stride for original tree and new tree are the same intp_t value_stride = orig_tree.value_stride intp_t max_depth_seen = -1 - intp_t rc = 0 + int rc = 0 Node* node float64_t* orig_value_ptr float64_t* new_value_ptr diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 50a545e89368c..b59d18879ca94 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -70,12 +70,12 @@ cdef class WeightedPQueue: cdef WeightedPQueueRecord* array_ cdef bint is_empty(self) noexcept nogil - cdef intp_t reset(self) except -1 nogil + cdef int reset(self) except -1 nogil cdef intp_t size(self) noexcept nogil - cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil - cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil - cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef intp_t peek(self, float64_t* data, float64_t* weight) noexcept nogil + cdef int push(self, float64_t data, float64_t weight) except -1 nogil + cdef int remove(self, float64_t data, float64_t weight) noexcept nogil + cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil cdef float64_t get_weight_from_index(self, intp_t index) noexcept nogil cdef float64_t get_value_from_index(self, intp_t index) noexcept nogil @@ -91,14 +91,14 @@ cdef class WeightedMedianCalculator: cdef intp_t k cdef float64_t sum_w_0_k # represents sum(weights[0:k]) = w[0] + w[1] + ... + w[k-1] cdef intp_t size(self) noexcept nogil - cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil - cdef intp_t reset(self) except -1 nogil - cdef intp_t update_median_parameters_post_push( + cdef int push(self, float64_t data, float64_t weight) except -1 nogil + cdef int reset(self) except -1 nogil + cdef int update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil - cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil - cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef intp_t update_median_parameters_post_remove( + cdef int remove(self, float64_t data, float64_t weight) noexcept nogil + cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef int update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil cdef float64_t get_median(self) noexcept nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 9fc890726aadb..21b21df9c3007 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -102,7 +102,7 @@ cdef class WeightedPQueue: def __dealloc__(self): free(self.array_) - cdef intp_t reset(self) except -1 nogil: + cdef int reset(self) except -1 nogil: """Reset the WeightedPQueue to its state at construction Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -119,7 +119,7 @@ cdef class WeightedPQueue: cdef intp_t size(self) noexcept nogil: return self.array_ptr - cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil: + cdef int push(self, float64_t data, float64_t weight) except -1 nogil: """Push record on the array. Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -151,7 +151,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr + 1 return 0 - cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil: + cdef int remove(self, float64_t data, float64_t weight) noexcept nogil: """Remove a specific value/weight record from the array. Returns 0 if successful, -1 if record not found.""" cdef intp_t array_ptr = self.array_ptr @@ -179,7 +179,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr - 1 return 0 - cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil: """Remove the top (minimum) element from array. Returns 0 if successful, -1 if nothing to remove.""" cdef intp_t array_ptr = self.array_ptr @@ -200,7 +200,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr - 1 return 0 - cdef intp_t peek(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil: """Write the top element from array to a pointer. Returns 0 if successful, -1 if nothing to write.""" cdef WeightedPQueueRecord* array = self.array_ @@ -277,7 +277,7 @@ cdef class WeightedMedianCalculator: WeightedMedianCalculator""" return self.samples.size() - cdef intp_t reset(self) except -1 nogil: + cdef int reset(self) except -1 nogil: """Reset the WeightedMedianCalculator to its state at construction Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -291,13 +291,13 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k = 0 return 0 - cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil: + cdef int push(self, float64_t data, float64_t weight) except -1 nogil: """Push a value and its associated weight to the WeightedMedianCalculator Return -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ - cdef intp_t return_value + cdef int return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -308,7 +308,7 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef intp_t update_median_parameters_post_push( + cdef int update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil: """Update the parameters used in the median calculation, @@ -350,11 +350,11 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 - cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil: + cdef int remove(self, float64_t data, float64_t weight) noexcept nogil: """Remove a value from the MedianHeap, removing it from consideration in the median calculation """ - cdef intp_t return_value + cdef int return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -365,11 +365,11 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil: """Pop a value from the MedianHeap, starting from the left and moving to the right. """ - cdef intp_t return_value + cdef int return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -385,7 +385,7 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef intp_t update_median_parameters_post_remove( + cdef int update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil: """Update the parameters used in the median calculation, From d7e2c014ce01907f5286b718b2c90da7e60a6d6b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 17 Jan 2024 12:35:07 -0500 Subject: [PATCH 10/10] Add relevant changes Signed-off-by: Adam Li --- sklearn/ensemble/_gb.py | 4 ++-- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 4 ++-- sklearn/tree/_classes.py | 6 +++--- sklearn/tree/_tree.pyx | 8 ++++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 88ac4d4a31801..1323e20e10e1a 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -1035,10 +1035,10 @@ def _compute_partial_dependence_recursion(self, grid, target_features): Parameters ---------- - grid : ndarray of shape (n_samples, n_target_features) + grid : ndarray of shape (n_samples, n_target_features), dtype=np.float32 The grid points on which the partial dependence should be evaluated. - target_features : ndarray of shape (n_target_features,) + target_features : ndarray of shape (n_target_features,), dtype=np.intp The set of target features for which the partial dependence should be evaluated. diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 971377b8c450b..d63a5c79f20dd 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1357,10 +1357,10 @@ def _compute_partial_dependence_recursion(self, grid, target_features): Parameters ---------- - grid : ndarray, shape (n_samples, n_target_features) + grid : ndarray, shape (n_samples, n_target_features), dtype=np.float32 The grid points on which the partial dependence should be evaluated. - target_features : ndarray, shape (n_target_features) + target_features : ndarray, shape (n_target_features), dtype=np.intp The set of target features for which the partial dependence should be evaluated. diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index b6172d233ce37..9f99d831a0990 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1387,16 +1387,16 @@ def _compute_partial_dependence_recursion(self, grid, target_features): Parameters ---------- - grid : ndarray of shape (n_samples, n_target_features) + grid : ndarray of shape (n_samples, n_target_features), dtype=np.float32 The grid points on which the partial dependence should be evaluated. - target_features : ndarray of shape (n_target_features) + target_features : ndarray of shape (n_target_features), dtype=np.intp The set of target features for which the partial dependence should be evaluated. Returns ------- - averaged_predictions : ndarray of shape (n_samples,) + averaged_predictions : ndarray of shape (n_samples,), dtype=np.float64 The value of the partial dependence function on each grid point. """ grid = np.asarray(grid, dtype=DTYPE, order="C") diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 22dd74e53009d..ea873764069f6 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -63,10 +63,10 @@ cdef float64_t INFINITY = np.inf cdef float64_t EPSILON = np.finfo('double').eps # Some handy constants (BestFirstTreeBuilder) -cdef intp_t IS_FIRST = 1 -cdef intp_t IS_NOT_FIRST = 0 -cdef intp_t IS_LEFT = 1 -cdef intp_t IS_NOT_LEFT = 0 +cdef bint IS_FIRST = 1 +cdef bint IS_NOT_FIRST = 0 +cdef bint IS_LEFT = 1 +cdef bint IS_NOT_LEFT = 0 TREE_LEAF = -1 TREE_UNDEFINED = -2