
MAINT Convert int to intp_t ctype def in tree/ related code #27546


Merged: 47 commits, Jan 22, 2024
Commits (47):
48a58e8  Working replacement (adam2392, Oct 7, 2023)
4e2232e  Convert rest of files (adam2392, Oct 7, 2023)
357400e  Finish ctypedef conversion (adam2392, Oct 7, 2023)
e74a0cf  Add conversion dtypes for target_features (adam2392, Oct 9, 2023)
e2a1204  Add inline comment (adam2392, Oct 9, 2023)
90a4233  Merge branch 'main' into ctypeint (adam2392, Oct 9, 2023)
9d95769  run lint (adam2392, Oct 9, 2023)
b0cc454  Merge branch 'main' into ctypeint (adam2392, Oct 10, 2023)
32dc901  Merge branch 'main' into ctypeint (adam2392, Oct 11, 2023)
ae05493  Merge branch 'main' into ctypeint (adam2392, Oct 12, 2023)
cdde707  Merge branch 'main' into ctypeint (adam2392, Oct 13, 2023)
753fb60  Merge branch 'main' into ctypeint (adam2392, Oct 17, 2023)
6b32bab  Merge branch 'main' into ctypeint (adam2392, Oct 23, 2023)
e66a3dd  Merge branch 'main' into ctypeint (adam2392, Oct 24, 2023)
df7a31a  Merge branch 'main' into ctypeint (adam2392, Oct 25, 2023)
defe311  Merge branch 'main' into ctypeint (adam2392, Oct 27, 2023)
502ca72  Merge branch 'main' into ctypeint (adam2392, Oct 30, 2023)
bc410d0  Merge branch 'main' into ctypeint (adam2392, Nov 3, 2023)
877dc32  Consolidate unit-testing (adam2392, Nov 3, 2023)
b41e857  Apply suggestions from code review (adam2392, Nov 3, 2023)
fdabf1d  Merge branch 'ctypeint' of https://github.com/neurodata/scikit-learn … (adam2392, Nov 3, 2023)
4ef948f  Merge branch 'main' into ctypeint (adam2392, Nov 3, 2023)
ab7abcc  Merge branch 'main' into ctypeint (adam2392, Nov 19, 2023)
765bead  Merge branch 'main' into ctypeint (adam2392, Nov 19, 2023)
ae10cc5  Merge branch 'main' into ctypeint (adam2392, Nov 20, 2023)
bb1713b  Merge branch 'main' into ctypeint (adam2392, Nov 21, 2023)
ae3d69b  Merge branch 'main' into ctypeint (adam2392, Nov 21, 2023)
0014b13  Merge branch 'main' into ctypeint (adam2392, Nov 22, 2023)
a099db9  Merge branch 'main' into ctypeint (adam2392, Nov 24, 2023)
e348e4a  Merge branch 'main' into ctypeint (adam2392, Nov 24, 2023)
45598a6  Merge branch 'main' into ctypeint (adam2392, Nov 27, 2023)
310d11c  Merge branch 'main' into ctypeint (adam2392, Dec 5, 2023)
3dfaea0  Merge branch 'main' into ctypeint (adam2392, Dec 6, 2023)
346d4d4  Merge branch 'main' into ctypeint (adam2392, Dec 9, 2023)
2dea684  Merge branch 'main' into ctypeint (adam2392, Dec 12, 2023)
ee91a14  Merge branch 'main' into ctypeint (adam2392, Dec 12, 2023)
88b9efb  Merge branch 'main' into ctypeint (adam2392, Dec 13, 2023)
276abdd  Merge branch 'main' into ctypeint (adam2392, Dec 15, 2023)
cb8ffe6  Merge branch 'main' into ctypeint (adam2392, Dec 26, 2023)
ebb7155  Merge branch 'main' into ctypeint (adam2392, Jan 2, 2024)
3d1622c  Merge branch 'main' into ctypeint (adam2392, Jan 11, 2024)
8b9a5dc  Apply suggestions from code review (adam2392, Jan 17, 2024)
d7e2c01  Add relevant changes (adam2392, Jan 17, 2024)
0dc4a12  Merge branch 'main' into ctypeint (adam2392, Jan 17, 2024)
ba0e0b0  Merge branch 'main' into ctypeint (adam2392, Jan 18, 2024)
229f4d7  Merge branch 'main' into ctypeint (adam2392, Jan 19, 2024)
339e30d  Merge branch 'main' into ctypeint (adam2392, Jan 22, 2024)
5 changes: 3 additions & 2 deletions sklearn/ensemble/_forest.py
@@ -1135,10 +1135,10 @@ def _compute_partial_dependence_recursion(self, grid, target_features):

         Parameters
         ----------
-        grid : ndarray of shape (n_samples, n_target_features)
+        grid : ndarray of shape (n_samples, n_target_features), dtype=DTYPE
             The grid points on which the partial dependence should be
             evaluated.
-        target_features : ndarray of shape (n_target_features)
+        target_features : ndarray of shape (n_target_features), dtype=np.intp
             The set of target features for which the partial dependence
             should be evaluated.

@@ -1148,6 +1148,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features):
             The value of the partial dependence function on each grid point.
         """
         grid = np.asarray(grid, dtype=DTYPE, order="C")
+        target_features = np.asarray(target_features, dtype=np.intp, order="C")
         averaged_predictions = np.zeros(
             shape=grid.shape[0], dtype=np.float64, order="C"
         )
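A minimal sketch (not part of the diff) of why the added np.asarray cast is cheap: with a matching dtype and layout it returns the input array unchanged, and it only copies when callers pass, for example, np.int32 indices.

    import numpy as np

    idx_intp = np.array([0, 2], dtype=np.intp)
    idx_i32 = np.array([0, 2], dtype=np.int32)
    # No copy when dtype and layout already match:
    assert np.asarray(idx_intp, dtype=np.intp, order="C") is idx_intp
    # A converting copy otherwise:
    assert np.asarray(idx_i32, dtype=np.intp, order="C") is not idx_i32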
6 changes: 4 additions & 2 deletions sklearn/ensemble/_gb.py
@@ -1042,10 +1042,10 @@ def _compute_partial_dependence_recursion(self, grid, target_features):

         Parameters
         ----------
-        grid : ndarray of shape (n_samples, n_target_features)
+        grid : ndarray of shape (n_samples, n_target_features), dtype=np.float32
             The grid points on which the partial dependence should be
             evaluated.
-        target_features : ndarray of shape (n_target_features,)
+        target_features : ndarray of shape (n_target_features,), dtype=np.intp
             The set of target features for which the partial dependence
             should be evaluated.

@@ -1068,6 +1068,8 @@ def _compute_partial_dependence_recursion(self, grid, target_features):
         averaged_predictions = np.zeros(
             (n_trees_per_stage, grid.shape[0]), dtype=np.float64, order="C"
         )
+        target_features = np.asarray(target_features, dtype=np.intp, order="C")
+
Member: You can specify the dtype of grid and target_features in the docstring as well.

Member Author: Done

         for stage in range(n_estimators):
             for k in range(n_trees_per_stage):
                 tree = self.estimators_[stage, k].tree_
4 changes: 2 additions & 2 deletions sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
@@ -148,7 +148,7 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data(
 def _compute_partial_dependence(
     node_struct [:] nodes,
     const X_DTYPE_C [:, ::1] X,
-    int [:] target_features,
+    const intp_t [:] target_features,
     Y_DTYPE_C [:] out
 ):
     """Partial dependence of the response on the ``target_features`` set.

@@ -173,7 +173,7 @@ def _compute_partial_dependence(
     X : view on 2d ndarray, shape (n_samples, n_target_features)
         The grid points on which the partial dependence should be
         evaluated.
-    target_features : view on 1d ndarray, shape (n_target_features)
+    target_features : view on 1d ndarray of intp_t, shape (n_target_features)
         The set of target features for which the partial dependence
         should be evaluated.
     out : view on 1d ndarray, shape (n_samples)
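A minimal sketch (not part of the diff) of what the const qualifier buys: a const memoryview also accepts read-only NumPy arrays, which a non-const typed view rejects at call time. The snippet below only simulates a read-only input from Python; treating memory-mapped inputs as the motivating case is an assumption, not stated in this PR.

    import numpy as np

    target_features = np.array([0, 1], dtype=np.intp)
    target_features.setflags(write=False)  # simulate a read-only input buffer
    # Passing this array to a parameter typed `intp_t [:]` raises
    # "buffer source array is read-only"; `const intp_t [:]` accepts it.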
5 changes: 3 additions & 2 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -1357,10 +1357,10 @@ def _compute_partial_dependence_recursion(self, grid, target_features):

         Parameters
         ----------
-        grid : ndarray, shape (n_samples, n_target_features)
+        grid : ndarray, shape (n_samples, n_target_features), dtype=np.float32
             The grid points on which the partial dependence should be
             evaluated.
-        target_features : ndarray, shape (n_target_features)
+        target_features : ndarray, shape (n_target_features), dtype=np.intp
             The set of target features for which the partial dependence
             should be evaluated.

@@ -1383,6 +1383,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features):
         averaged_predictions = np.zeros(
             (self.n_trees_per_iteration_, grid.shape[0]), dtype=Y_DTYPE
         )
+        target_features = np.asarray(target_features, dtype=np.intp, order="C")
Member: Can you update the docstring above to specify the dtype of grid and target_features?

Member Author: Done


         for predictors_of_ith_iteration in self._predictors:
             for k, predictor in enumerate(predictors_of_ith_iteration):
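For context, a minimal sketch of why np.intp is the right Python-side counterpart of Cython's intp_t:

    import numpy as np

    # np.intp is the pointer-sized integer (Py_ssize_t), matching intp_t:
    # 8 bytes on 64-bit builds, 4 bytes on 32-bit ones. A hard-coded C `int`
    # is typically 4 bytes on either, which is what the old code assumed.
    print(np.dtype(np.intp).itemsize)  # 8 on most modern platforms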
2 changes: 1 addition & 1 deletion sklearn/inspection/_partial_dependence.py
@@ -660,7 +660,7 @@ def partial_dependence(
         raise ValueError("all features must be in [0, {}]".format(X.shape[1] - 1))

     features_indices = np.asarray(
-        _get_column_indices(X, features), dtype=np.int32, order="C"
+        _get_column_indices(X, features), dtype=np.intp, order="C"
     ).ravel()

     feature_names = _check_feature_names(X, feature_names)
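The order="C" argument matters alongside the dtype; a minimal sketch, assuming the downstream Cython signature requires contiguous memory:

    import numpy as np

    idx = np.arange(6, dtype=np.intp)[::2]             # non-contiguous view
    idx_c = np.asarray(idx, dtype=np.intp, order="C")  # contiguous copy
    print(idx.flags["C_CONTIGUOUS"], idx_c.flags["C_CONTIGUOUS"])  # False True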
4 changes: 2 additions & 2 deletions sklearn/inspection/tests/test_partial_dependence.py
@@ -272,7 +272,7 @@ def test_partial_dependence_helpers(est, method, target_feature):
     est.fit(X, y)

     # target feature will be set to .5 and then to 123
-    features = np.array([target_feature], dtype=np.int32)
+    features = np.array([target_feature], dtype=np.intp)
     grid = np.array([[0.5], [123]])

     if method == "brute":

@@ -356,7 +356,7 @@ def test_recursion_decision_tree_vs_forest_and_gbdt(seed):

     grid = rng.randn(50).reshape(-1, 1)
     for f in range(n_features):
-        features = np.array([f], dtype=np.int32)
+        features = np.array([f], dtype=np.intp)

         pdp_forest = _partial_dependence_recursion(forest, grid, features)
         pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features)
7 changes: 4 additions & 3 deletions sklearn/tree/_classes.py
@@ -1387,22 +1387,23 @@ def _compute_partial_dependence_recursion(self, grid, target_features):

         Parameters
         ----------
-        grid : ndarray of shape (n_samples, n_target_features)
+        grid : ndarray of shape (n_samples, n_target_features), dtype=np.float32
             The grid points on which the partial dependence should be
             evaluated.
-        target_features : ndarray of shape (n_target_features)
+        target_features : ndarray of shape (n_target_features), dtype=np.intp
             The set of target features for which the partial dependence
             should be evaluated.

         Returns
         -------
-        averaged_predictions : ndarray of shape (n_samples,)
+        averaged_predictions : ndarray of shape (n_samples,), dtype=np.float64
             The value of the partial dependence function on each grid point.
         """
         grid = np.asarray(grid, dtype=DTYPE, order="C")
         averaged_predictions = np.zeros(
             shape=grid.shape[0], dtype=np.float64, order="C"
         )
+        target_features = np.asarray(target_features, dtype=np.intp, order="C")
Member: You can also update the grid and target_features dtype in the docstring.

Member Author: Done


         self.tree_.compute_partial_dependence(
             grid, target_features, averaged_predictions
18 changes: 13 additions & 5 deletions sklearn/tree/_splitter.pyx
@@ -193,8 +193,12 @@ cdef class Splitter:
         self.criterion.init_sum_missing()
         return 0

-    cdef int node_reset(self, intp_t start, intp_t end,
-                        float64_t* weighted_n_node_samples) except -1 nogil:
+    cdef int node_reset(
+        self,
+        intp_t start,
+        intp_t end,
+        float64_t* weighted_n_node_samples
+    ) except -1 nogil:
         """Reset splitter on node samples[start:end].

         Returns -1 in case of failure to allocate memory (and raise MemoryError)

@@ -559,7 +563,7 @@ cdef inline int node_split_best(
 cdef inline void sort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil:
     if n == 0:
         return
-    cdef int maxd = 2 * <int>log(n)
+    cdef intp_t maxd = 2 * <intp_t>log(n)
Member: Since n is already intp_t, this makes sense.

     introsort(feature_values, samples, n, maxd)


@@ -593,7 +597,7 @@ cdef inline float32_t median3(float32_t* feature_values, intp_t n) noexcept nogil:
 # Introsort with median of 3 pivot selection and 3-way partition function
 # (robust to repeated elements, e.g. lots of zero features).
 cdef void introsort(float32_t* feature_values, intp_t *samples,
-                    intp_t n, int maxd) noexcept nogil:
+                    intp_t n, intp_t maxd) noexcept nogil:
     cdef float32_t pivot
     cdef intp_t i, l, r

@@ -1340,7 +1344,11 @@ cdef class SparsePartitioner:


 cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil:
-    """Comparison function for sort."""
+    """Comparison function for sort.
+
+    This must return an `int` as it is used by stdlib's qsort, which expects
+    an `int` return value.
+    """
     return <int>((<intp_t*>a)[0] - (<intp_t*>b)[0])
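A minimal sketch of the qsort contract the new docstring refers to: a C comparator returns a negative, zero, or positive int, the same convention Python's cmp_to_key uses, which is why compare_SIZE_t keeps an `int` return type even though it compares intp_t values.

    from functools import cmp_to_key

    def compare(a, b):
        # Negative, zero, or positive, exactly like a qsort comparator.
        return (a > b) - (a < b)

    print(sorted([3, 1, 2], key=cmp_to_key(compare)))  # [1, 2, 3]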
36 changes: 18 additions & 18 deletions sklearn/tree/_tree.pyx
@@ -63,10 +63,10 @@ cdef float64_t INFINITY = np.inf
 cdef float64_t EPSILON = np.finfo('double').eps

 # Some handy constants (BestFirstTreeBuilder)
-cdef int IS_FIRST = 1
-cdef int IS_NOT_FIRST = 0
-cdef int IS_LEFT = 1
-cdef int IS_NOT_LEFT = 0
+cdef bint IS_FIRST = 1
+cdef bint IS_NOT_FIRST = 0
+cdef bint IS_LEFT = 1
+cdef bint IS_NOT_LEFT = 0

 TREE_LEAF = -1
 TREE_UNDEFINED = -2

@@ -177,10 +177,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
         X, y, sample_weight = self._check_input(X, y, sample_weight)

         # Initial capacity
-        cdef int init_capacity
+        cdef intp_t init_capacity

         if tree.max_depth <= 10:
-            init_capacity = <int> (2 ** (tree.max_depth + 1)) - 1
+            init_capacity = <intp_t> (2 ** (tree.max_depth + 1)) - 1
         else:
             init_capacity = 2047

@@ -696,32 +696,32 @@ cdef class Tree:

     Attributes
     ----------
-    node_count : int
+    node_count : intp_t
Member: I think I agree with the changes below because they are all linked to indexing types.
         The number of nodes (internal nodes + leaves) in the tree.

-    capacity : int
+    capacity : intp_t
         The current capacity (i.e., size) of the arrays, which is at least as
         great as `node_count`.

-    max_depth : int
+    max_depth : intp_t
         The depth of the tree, i.e. the maximum depth of its leaves.

-    children_left : array of int, shape [node_count]
+    children_left : array of intp_t, shape [node_count]
         children_left[i] holds the node id of the left child of node i.
         For leaves, children_left[i] == TREE_LEAF. Otherwise,
         children_left[i] > i. This child handles the case where
         X[:, feature[i]] <= threshold[i].

-    children_right : array of int, shape [node_count]
+    children_right : array of intp_t, shape [node_count]
         children_right[i] holds the node id of the right child of node i.
         For leaves, children_right[i] == TREE_LEAF. Otherwise,
         children_right[i] > i. This child handles the case where
         X[:, feature[i]] > threshold[i].

-    n_leaves : int
+    n_leaves : intp_t
         Number of leaves in the tree.

-    feature : array of int, shape [node_count]
+    feature : array of intp_t, shape [node_count]
         feature[i] holds the feature to split on, for the internal node i.

     threshold : array of float64_t, shape [node_count]

@@ -734,7 +734,7 @@ cdef class Tree:
         impurity[i] holds the impurity (i.e., the value of the splitting
         criterion) at node i.

-    n_node_samples : array of int, shape [node_count]
+    n_node_samples : array of intp_t, shape [node_count]
         n_node_samples[i] holds the number of training samples reaching node i.

     weighted_n_node_samples : array of float64_t, shape [node_count]

@@ -797,7 +797,7 @@ cdef class Tree:

     # TODO: Convert n_classes to cython.integral memory view once
     # https://github.com/cython/cython/issues/5243 is fixed
-    def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs):
+    def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs):
         """Constructor."""
         cdef intp_t dummy = 0
         size_t_dtype = np.array(dummy).dtype

@@ -1343,7 +1343,7 @@ cdef class Tree:
         return arr

     def compute_partial_dependence(self, float32_t[:, ::1] X,
-                                   int[::1] target_features,
+                                   const intp_t[::1] target_features,
                                    float64_t[::1] out):
         """Partial dependence of the response on the ``target_feature`` set.

@@ -1379,7 +1379,7 @@ cdef class Tree:
                                  dtype=np.intp)
             intp_t sample_idx
             intp_t feature_idx
-            int stack_size
+            intp_t stack_size
             float64_t left_sample_frac
             float64_t current_weight
             float64_t total_weight  # used for sanity check only

@@ -1627,7 +1627,7 @@ cdef class _PathFinder(_CCPPruneController):
     cdef float64_t[:] impurities
     cdef uint32_t count

-    def __cinit__(self, int node_count):
+    def __cinit__(self, intp_t node_count):
         self.ccp_alphas = np.zeros(shape=(node_count), dtype=np.float64)
         self.impurities = np.zeros(shape=(node_count), dtype=np.float64)
         self.count = 0
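A minimal check (not part of the diff) of the init_capacity formula in the DepthFirstTreeBuilder hunk above: a perfect binary tree of depth d has 2**(d + 1) - 1 nodes, so depth 10 yields exactly the 2047-node fallback capacity used for deeper trees.

    def perfect_tree_nodes(depth):
        # Node count of a perfect binary tree: 1 + 2 + ... + 2**depth.
        return 2 ** (depth + 1) - 1

    assert perfect_tree_nodes(10) == 2047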
6 changes: 3 additions & 3 deletions sklearn/tree/_utils.pyx
@@ -450,9 +450,9 @@ cdef class WeightedMedianCalculator:
 def _any_isnan_axis0(const float32_t[:, :] X):
     """Same as np.any(np.isnan(X), axis=0)"""
     cdef:
-        int i, j
-        int n_samples = X.shape[0]
-        int n_features = X.shape[1]
+        intp_t i, j
+        intp_t n_samples = X.shape[0]
+        intp_t n_features = X.shape[1]
         unsigned char[::1] isnan_out = np.zeros(X.shape[1], dtype=np.bool_)

     with nogil:
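The helper's docstring names its reference behaviour; a minimal NumPy check of that equivalence:

    import numpy as np

    X = np.array([[0.0, np.nan], [1.0, 2.0]], dtype=np.float32)
    print(np.any(np.isnan(X), axis=0))  # [False  True]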