Skip to content

MAINT Replace cnp.ndarray with memory views in sklearn.tree._tree (where possible) #25540

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Feb 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@
"sklearn.svm._libsvm_sparse",
"sklearn.svm._newrand",
"sklearn.tree._splitter",
"sklearn.tree._tree",
"sklearn.tree._utils",
"sklearn.utils._cython_blas",
"sklearn.utils._fast_dict",
Expand Down
17 changes: 14 additions & 3 deletions sklearn/tree/_tree.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,17 @@ cdef class TreeBuilder:
cdef SIZE_t max_depth # Maximal tree depth
cdef double min_impurity_decrease # Impurity threshold for early stopping

cpdef build(self, Tree tree, object X, cnp.ndarray y,
cnp.ndarray sample_weight=*)
cdef _check_input(self, object X, cnp.ndarray y, cnp.ndarray sample_weight)
cpdef build(
self,
Tree tree,
object X,
const DOUBLE_t[:, ::1] y,
const DOUBLE_t[:] sample_weight=*,
)

cdef _check_input(
self,
object X,
const DOUBLE_t[:, ::1] y,
const DOUBLE_t[:] sample_weight,
)
163 changes: 86 additions & 77 deletions sklearn/tree/_tree.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,22 @@ NODE_DTYPE = np.asarray(<Node[:1]>(&dummy)).dtype
cdef class TreeBuilder:
"""Interface for different tree building strategies."""

cpdef build(self, Tree tree, object X, cnp.ndarray y,
cnp.ndarray sample_weight=None):
cpdef build(
self,
Tree tree,
object X,
const DOUBLE_t[:, ::1] y,
const DOUBLE_t[:] sample_weight=None,
):
"""Build a decision tree from the training set (X, y)."""
pass

cdef inline _check_input(self, object X, cnp.ndarray y,
cnp.ndarray sample_weight):
cdef inline _check_input(
self,
object X,
const DOUBLE_t[:, ::1] y,
const DOUBLE_t[:] sample_weight,
):
"""Check input dtype, layout and format"""
if issparse(X):
X = X.tocsc()
Expand All @@ -109,12 +118,15 @@ cdef class TreeBuilder:
# since we have to copy we will make it fortran for efficiency
X = np.asfortranarray(X, dtype=DTYPE)

if y.dtype != DOUBLE or not y.flags.contiguous:
# TODO: This check for y seems to be redundant, as it is also
# present in the BaseDecisionTree's fit method, and therefore
# can be removed.
if y.base.dtype != DOUBLE or not y.base.flags.contiguous:
y = np.ascontiguousarray(y, dtype=DOUBLE)

if (sample_weight is not None and
(sample_weight.dtype != DOUBLE or
not sample_weight.flags.contiguous)):
(sample_weight.base.dtype != DOUBLE or
not sample_weight.base.flags.contiguous)):
sample_weight = np.asarray(sample_weight, dtype=DOUBLE,
order="C")

Expand Down Expand Up @@ -144,8 +156,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
self.max_depth = max_depth
self.min_impurity_decrease = min_impurity_decrease

cpdef build(self, Tree tree, object X, cnp.ndarray y,
cnp.ndarray sample_weight=None):
cpdef build(
self,
Tree tree,
object X,
const DOUBLE_t[:, ::1] y,
const DOUBLE_t[:] sample_weight=None,
):
"""Build a decision tree from the training set (X, y)."""

# check input
Expand Down Expand Up @@ -335,8 +352,13 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease

cpdef build(self, Tree tree, object X, cnp.ndarray y,
cnp.ndarray sample_weight=None):
cpdef build(
self,
Tree tree,
object X,
const DOUBLE_t[:, ::1] y,
const DOUBLE_t[:] sample_weight=None,
):
"""Build a decision tree from the training set (X, y)."""

# check input
Expand Down Expand Up @@ -608,6 +630,8 @@ cdef class Tree:
def __get__(self):
return self._get_value_ndarray()[:self.node_count]

# TODO: Convert n_classes to cython.integral memory view once
# https://github.com/cython/cython/issues/5243 is fixed
def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs):
"""Constructor."""
cdef SIZE_t dummy = 0
Expand Down Expand Up @@ -683,9 +707,12 @@ cdef class Tree:
self.capacity = node_ndarray.shape[0]
if self._resize_c(self.capacity) != 0:
raise MemoryError("resizing tree to %d" % self.capacity)
nodes = memcpy(self.nodes, (<cnp.ndarray> node_ndarray).data,

cdef Node[::1] node_memory_view = node_ndarray
cdef DOUBLE_t[:, :, ::1] value_memory_view = value_ndarray
nodes = memcpy(self.nodes, &node_memory_view[0],
self.capacity * sizeof(Node))
value = memcpy(self.value, (<cnp.ndarray> value_ndarray).data,
value = memcpy(self.value, &value_memory_view[0, 0, 0],
self.capacity * self.value_stride * sizeof(double))

cdef int _resize(self, SIZE_t capacity) nogil except -1:
Expand Down Expand Up @@ -804,8 +831,7 @@ cdef class Tree:
cdef SIZE_t n_samples = X.shape[0]

# Initialize output
cdef cnp.ndarray[SIZE_t] out = np.zeros((n_samples,), dtype=np.intp)
cdef SIZE_t* out_ptr = <SIZE_t*> out.data
cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp)

# Initialize auxiliary data-structure
cdef Node* node = NULL
Expand All @@ -822,9 +848,9 @@ cdef class Tree:
else:
node = &self.nodes[node.right_child]

out_ptr[i] = <SIZE_t>(node - self.nodes) # node offset
out[i] = <SIZE_t>(node - self.nodes) # node offset

return out
return np.asarray(out)

cdef inline cnp.ndarray _apply_sparse_csr(self, object X):
"""Finds the terminal region (=leaf node) for each sample in sparse X.
Expand All @@ -838,21 +864,15 @@ cdef class Tree:
raise ValueError("X.dtype should be np.float32, got %s" % X.dtype)

# Extract input
cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data
cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices
cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr

cdef DTYPE_t* X_data = <DTYPE_t*>X_data_ndarray.data
cdef INT32_t* X_indices = <INT32_t*>X_indices_ndarray.data
cdef INT32_t* X_indptr = <INT32_t*>X_indptr_ndarray.data
cdef const DTYPE_t[:] X_data = X.data
cdef const INT32_t[:] X_indices = X.indices
cdef const INT32_t[:] X_indptr = X.indptr

cdef SIZE_t n_samples = X.shape[0]
cdef SIZE_t n_features = X.shape[1]

# Initialize output
cdef cnp.ndarray[SIZE_t, ndim=1] out = np.zeros((n_samples,),
dtype=np.intp)
cdef SIZE_t* out_ptr = <SIZE_t*> out.data
cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp)

# Initialize auxiliary data-structure
cdef DTYPE_t feature_value = 0.
Expand Down Expand Up @@ -893,13 +913,13 @@ cdef class Tree:
else:
node = &self.nodes[node.right_child]

out_ptr[i] = <SIZE_t>(node - self.nodes) # node offset
out[i] = <SIZE_t>(node - self.nodes) # node offset

# Free auxiliary arrays
free(X_sample)
free(feature_to_sample)

return out
return np.asarray(out)

cpdef object decision_path(self, object X):
"""Finds the decision path (=node) for each sample in X."""
Expand All @@ -924,13 +944,10 @@ cdef class Tree:
cdef SIZE_t n_samples = X.shape[0]

# Initialize output
cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp)
cdef SIZE_t* indptr_ptr = <SIZE_t*> indptr.data

cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples *
(1 + self.max_depth),
dtype=np.intp)
cdef SIZE_t* indices_ptr = <SIZE_t*> indices.data
cdef SIZE_t[:] indptr = np.zeros(n_samples + 1, dtype=np.intp)
cdef SIZE_t[:] indices = np.zeros(
n_samples * (1 + self.max_depth), dtype=np.intp
)

# Initialize auxiliary data-structure
cdef Node* node = NULL
Expand All @@ -939,26 +956,25 @@ cdef class Tree:
with nogil:
for i in range(n_samples):
node = self.nodes
indptr_ptr[i + 1] = indptr_ptr[i]
indptr[i + 1] = indptr[i]

# Add all external nodes
while node.left_child != _TREE_LEAF:
# ... and node.right_child != _TREE_LEAF:
indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
indptr_ptr[i + 1] += 1
indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
indptr[i + 1] += 1

if X_ndarray[i, node.feature] <= node.threshold:
node = &self.nodes[node.left_child]
else:
node = &self.nodes[node.right_child]

# Add the leave node
indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
indptr_ptr[i + 1] += 1
indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
indptr[i + 1] += 1

indices = indices[:indptr[n_samples]]
cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices),
dtype=np.intp)
cdef SIZE_t[:] data = np.ones(shape=len(indices), dtype=np.intp)
out = csr_matrix((data, indices, indptr),
shape=(n_samples, self.node_count))

Expand All @@ -976,25 +992,18 @@ cdef class Tree:
raise ValueError("X.dtype should be np.float32, got %s" % X.dtype)

# Extract input
cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data
cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices
cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr

cdef DTYPE_t* X_data = <DTYPE_t*>X_data_ndarray.data
cdef INT32_t* X_indices = <INT32_t*>X_indices_ndarray.data
cdef INT32_t* X_indptr = <INT32_t*>X_indptr_ndarray.data
cdef const DTYPE_t[:] X_data = X.data
cdef const INT32_t[:] X_indices = X.indices
cdef const INT32_t[:] X_indptr = X.indptr

cdef SIZE_t n_samples = X.shape[0]
cdef SIZE_t n_features = X.shape[1]

# Initialize output
cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp)
cdef SIZE_t* indptr_ptr = <SIZE_t*> indptr.data

cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples *
(1 + self.max_depth),
dtype=np.intp)
cdef SIZE_t* indices_ptr = <SIZE_t*> indices.data
cdef SIZE_t[:] indptr = np.zeros(n_samples + 1, dtype=np.intp)
cdef SIZE_t[:] indices = np.zeros(
n_samples * (1 + self.max_depth), dtype=np.intp
)

# Initialize auxiliary data-structure
cdef DTYPE_t feature_value = 0.
Expand All @@ -1016,7 +1025,7 @@ cdef class Tree:

for i in range(n_samples):
node = self.nodes
indptr_ptr[i + 1] = indptr_ptr[i]
indptr[i + 1] = indptr[i]

for k in range(X_indptr[i], X_indptr[i + 1]):
feature_to_sample[X_indices[k]] = i
Expand All @@ -1026,8 +1035,8 @@ cdef class Tree:
while node.left_child != _TREE_LEAF:
# ... and node.right_child != _TREE_LEAF:

indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
indptr_ptr[i + 1] += 1
indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
indptr[i + 1] += 1

if feature_to_sample[node.feature] == i:
feature_value = X_sample[node.feature]
Expand All @@ -1041,16 +1050,15 @@ cdef class Tree:
node = &self.nodes[node.right_child]

# Add the leave node
indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
indptr_ptr[i + 1] += 1
indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
indptr[i + 1] += 1

# Free auxiliary arrays
free(X_sample)
free(feature_to_sample)

indices = indices[:indptr[n_samples]]
cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices),
dtype=np.intp)
cdef SIZE_t[:] data = np.ones(shape=len(indices), dtype=np.intp)
out = csr_matrix((data, indices, indptr),
shape=(n_samples, self.node_count))

Expand Down Expand Up @@ -1093,9 +1101,7 @@ cdef class Tree:

cdef double normalizer = 0.

cdef cnp.ndarray[cnp.float64_t, ndim=1] importances
importances = np.zeros((self.n_features,))
cdef DOUBLE_t* importance_data = <DOUBLE_t*>importances.data
cdef cnp.float64_t[:] importances = np.zeros(self.n_features)

with nogil:
while node != end_node:
Expand All @@ -1104,22 +1110,24 @@ cdef class Tree:
left = &nodes[node.left_child]
right = &nodes[node.right_child]

importance_data[node.feature] += (
importances[node.feature] += (
node.weighted_n_node_samples * node.impurity -
left.weighted_n_node_samples * left.impurity -
right.weighted_n_node_samples * right.impurity)
node += 1

importances /= nodes[0].weighted_n_node_samples
for i in range(self.n_features):
importances[i] /= nodes[0].weighted_n_node_samples

if normalize:
normalizer = np.sum(importances)

if normalizer > 0.0:
# Avoid dividing by zero (e.g., when root is pure)
importances /= normalizer
for i in range(self.n_features):
importances[i] /= normalizer

return importances
return np.asarray(importances)

cdef cnp.ndarray _get_value_ndarray(self):
"""Wraps value as a 3-d NumPy array.
Expand Down Expand Up @@ -1154,7 +1162,7 @@ cdef class Tree:
arr = PyArray_NewFromDescr(<PyTypeObject *> cnp.ndarray,
<cnp.dtype> NODE_DTYPE, 1, shape,
strides, <void*> self.nodes,
cnp.NPY_DEFAULT, None)
cnp.NPY_ARRAY_DEFAULT, None)
Py_INCREF(self)
if PyArray_SetBaseObject(arr, <PyObject*> self) < 0:
raise ValueError("Can't initialize array.")
Expand Down Expand Up @@ -1686,18 +1694,19 @@ def ccp_pruning_path(Tree orig_tree):

cdef:
UINT32_t total_items = path_finder.count
cnp.ndarray ccp_alphas = np.empty(shape=total_items,
dtype=np.float64)
cnp.ndarray impurities = np.empty(shape=total_items,
dtype=np.float64)
DOUBLE_t[:] ccp_alphas = np.empty(shape=total_items, dtype=np.float64)
DOUBLE_t[:] impurities = np.empty(shape=total_items, dtype=np.float64)
UINT32_t count = 0

while count < total_items:
ccp_alphas[count] = path_finder.ccp_alphas[count]
impurities[count] = path_finder.impurities[count]
count += 1

return {'ccp_alphas': ccp_alphas, 'impurities': impurities}
return {
'ccp_alphas': np.asarray(ccp_alphas),
'impurities': np.asarray(impurities),
}


cdef struct BuildPrunedRecord:
Expand Down