From bf9e2641c811e177870880a28b8b3a1e0b19e6a9 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Sat, 4 Feb 2023 19:21:45 +0500 Subject: [PATCH 01/14] MAINT Remove -Wcpp warnings when compiling sklearn.tree._tree --- sklearn/tree/_tree.pxd | 8 ++++---- sklearn/tree/_tree.pyx | 34 +++++++++++++++++----------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 11c848881f6d3..f6bb354ae0cc2 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -12,7 +12,7 @@ import numpy as np cimport numpy as cnp - +from cython cimport floating ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef cnp.npy_intp SIZE_t # Type for indices and counters @@ -99,6 +99,6 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping - cpdef build(self, Tree tree, object X, cnp.ndarray y, - cnp.ndarray sample_weight=*) - cdef _check_input(self, object X, cnp.ndarray y, cnp.ndarray sample_weight) + cpdef build(self, Tree tree, object X, const floating[:] y, + const floating[:] sample_weight=*) + cdef _check_input(self, object X, const floating[:] y, const floating[:] sample_weight) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 730b2c1c3c9c6..f51f740b1a47a 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -13,7 +13,7 @@ # License: BSD 3 clause from cpython cimport Py_INCREF, PyObject, PyTypeObject - +from cython cimport floating from libc.stdlib cimport free from libc.string cimport memcpy from libc.string cimport memset @@ -86,13 +86,13 @@ NODE_DTYPE = np.asarray((&dummy)).dtype cdef class TreeBuilder: """Interface for different tree building strategies.""" - cpdef build(self, Tree tree, object X, cnp.ndarray y, - cnp.ndarray sample_weight=None): + cpdef build(self, Tree tree, object X, const floating[:] y, + const floating[:] sample_weight=None): """Build a decision tree from the training set (X, y).""" pass - cdef inline _check_input(self, object X, cnp.ndarray y, - cnp.ndarray sample_weight): + cdef inline _check_input(self, object X, const floating[:] y, + const floating[:] sample_weight): """Check input dtype, layout and format""" if issparse(X): X = X.tocsc() @@ -109,16 +109,16 @@ cdef class TreeBuilder: # since we have to copy we will make it fortran for efficiency X = np.asfortranarray(X, dtype=DTYPE) - if y.dtype != DOUBLE or not y.flags.contiguous: + if y.base.dtype != DOUBLE or not y.base.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) if (sample_weight is not None and - (sample_weight.dtype != DOUBLE or - not sample_weight.flags.contiguous)): + (sample_weight.base.dtype != DOUBLE or + not sample_weight.base.flags.contiguous)): sample_weight = np.asarray(sample_weight, dtype=DOUBLE, order="C") - return X, y, sample_weight + return X, y.base, sample_weight.base # Depth first builder --------------------------------------------------------- # A record on the stack for depth-first tree growing @@ -144,12 +144,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease - cpdef build(self, Tree tree, object X, cnp.ndarray y, - cnp.ndarray sample_weight=None): + cpdef build(self, Tree tree, object X, const floating[:] y, + const floating[:] sample_weight=None): """Build a decision tree from the training set (X, y).""" # check input - X, y, sample_weight = self._check_input(X, y, sample_weight) + X, _y, _sample_weight = self._check_input(X, y, sample_weight) # Initial capacity cdef int init_capacity @@ -170,7 +170,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef double min_impurity_decrease = self.min_impurity_decrease # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight) + splitter.init(X, _y, _sample_weight) cdef SIZE_t start cdef SIZE_t end @@ -335,19 +335,19 @@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease - cpdef build(self, Tree tree, object X, cnp.ndarray y, - cnp.ndarray sample_weight=None): + cpdef build(self, Tree tree, object X, const floating[:] y, + const floating[:] sample_weight=None): """Build a decision tree from the training set (X, y).""" # check input - X, y, sample_weight = self._check_input(X, y, sample_weight) + X, _y, _sample_weight = self._check_input(X, y, sample_weight) # Parameters cdef Splitter splitter = self.splitter cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight) + splitter.init(X, _y, _sample_weight) cdef vector[FrontierRecord] frontier cdef FrontierRecord record From eb6a658b0ada3464e0fa79da716f57af053141fe Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Mon, 6 Feb 2023 12:55:24 +0500 Subject: [PATCH 02/14] * Set y and sample_weight as DOUBLE_t memory views in build * Remove condition for checking y in _check_input as it seems redundant --- sklearn/tree/_tree.pxd | 8 ++++---- sklearn/tree/_tree.pyx | 31 ++++++++++++++----------------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index f6bb354ae0cc2..986c52aacd17e 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -12,7 +12,7 @@ import numpy as np cimport numpy as cnp -from cython cimport floating + ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef cnp.npy_intp SIZE_t # Type for indices and counters @@ -99,6 +99,6 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping - cpdef build(self, Tree tree, object X, const floating[:] y, - const floating[:] sample_weight=*) - cdef _check_input(self, object X, const floating[:] y, const floating[:] sample_weight) + cpdef build(self, Tree tree, object X, const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=*) + cdef _check_input(self, object X, const DOUBLE_t[:] sample_weight) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f51f740b1a47a..335499cd3e9a2 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -13,7 +13,7 @@ # License: BSD 3 clause from cpython cimport Py_INCREF, PyObject, PyTypeObject -from cython cimport floating + from libc.stdlib cimport free from libc.string cimport memcpy from libc.string cimport memset @@ -86,13 +86,12 @@ NODE_DTYPE = np.asarray((&dummy)).dtype cdef class TreeBuilder: """Interface for different tree building strategies.""" - cpdef build(self, Tree tree, object X, const floating[:] y, - const floating[:] sample_weight=None): + cpdef build(self, Tree tree, object X, const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None): """Build a decision tree from the training set (X, y).""" pass - cdef inline _check_input(self, object X, const floating[:] y, - const floating[:] sample_weight): + cdef inline _check_input(self, object X, const DOUBLE_t[:] sample_weight): """Check input dtype, layout and format""" if issparse(X): X = X.tocsc() @@ -109,16 +108,14 @@ cdef class TreeBuilder: # since we have to copy we will make it fortran for efficiency X = np.asfortranarray(X, dtype=DTYPE) - if y.base.dtype != DOUBLE or not y.base.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) - if (sample_weight is not None and (sample_weight.base.dtype != DOUBLE or not sample_weight.base.flags.contiguous)): sample_weight = np.asarray(sample_weight, dtype=DOUBLE, order="C") - return X, y.base, sample_weight.base + sample_weight_exists = sample_weight is not None and sample_weight.size > 0 + return X, sample_weight.base if sample_weight_exists else None # Depth first builder --------------------------------------------------------- # A record on the stack for depth-first tree growing @@ -144,12 +141,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease - cpdef build(self, Tree tree, object X, const floating[:] y, - const floating[:] sample_weight=None): + cpdef build(self, Tree tree, object X, const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None): """Build a decision tree from the training set (X, y).""" # check input - X, _y, _sample_weight = self._check_input(X, y, sample_weight) + X, sample_weight = self._check_input(X, sample_weight) # Initial capacity cdef int init_capacity @@ -170,7 +167,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef double min_impurity_decrease = self.min_impurity_decrease # Recursive partition (without actual recursion) - splitter.init(X, _y, _sample_weight) + splitter.init(X, y, sample_weight) cdef SIZE_t start cdef SIZE_t end @@ -335,19 +332,19 @@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease - cpdef build(self, Tree tree, object X, const floating[:] y, - const floating[:] sample_weight=None): + cpdef build(self, Tree tree, object X, const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None): """Build a decision tree from the training set (X, y).""" # check input - X, _y, _sample_weight = self._check_input(X, y, sample_weight) + X, sample_weight = self._check_input(X, sample_weight) # Parameters cdef Splitter splitter = self.splitter cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes # Recursive partition (without actual recursion) - splitter.init(X, _y, _sample_weight) + splitter.init(X, y, sample_weight) cdef vector[FrontierRecord] frontier cdef FrontierRecord record From 73d542d979ff3544b4770403ec09da9a5f535b74 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Mon, 6 Feb 2023 16:00:44 +0500 Subject: [PATCH 03/14] Use SIZE_t memory view for num_classes in tree __cinit__ --- sklearn/tree/_tree.pxd | 2 +- sklearn/tree/_tree.pyx | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 986c52aacd17e..9c6d8213b0e13 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -101,4 +101,4 @@ cdef class TreeBuilder: cpdef build(self, Tree tree, object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=*) - cdef _check_input(self, object X, const DOUBLE_t[:] sample_weight) + cdef _check_input(self, object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 335499cd3e9a2..536d1c8949d29 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -91,7 +91,8 @@ cdef class TreeBuilder: """Build a decision tree from the training set (X, y).""" pass - cdef inline _check_input(self, object X, const DOUBLE_t[:] sample_weight): + cdef inline _check_input(self, object X, const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight): """Check input dtype, layout and format""" if issparse(X): X = X.tocsc() @@ -108,6 +109,9 @@ cdef class TreeBuilder: # since we have to copy we will make it fortran for efficiency X = np.asfortranarray(X, dtype=DTYPE) + if y.dtype != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + if (sample_weight is not None and (sample_weight.base.dtype != DOUBLE or not sample_weight.base.flags.contiguous)): @@ -115,7 +119,7 @@ cdef class TreeBuilder: order="C") sample_weight_exists = sample_weight is not None and sample_weight.size > 0 - return X, sample_weight.base if sample_weight_exists else None + return X, y.base, sample_weight.base if sample_weight_exists else None # Depth first builder --------------------------------------------------------- # A record on the stack for depth-first tree growing @@ -146,7 +150,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree from the training set (X, y).""" # check input - X, sample_weight = self._check_input(X, sample_weight) + X, y, sample_weight = self._check_input(X, y, sample_weight) # Initial capacity cdef int init_capacity @@ -337,7 +341,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """Build a decision tree from the training set (X, y).""" # check input - X, sample_weight = self._check_input(X, sample_weight) + X, y, sample_weight = self._check_input(X, y, sample_weight) # Parameters cdef Splitter splitter = self.splitter @@ -605,12 +609,12 @@ cdef class Tree: def __get__(self): return self._get_value_ndarray()[:self.node_count] - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + def __cinit__(self, int n_features, SIZE_t[:] n_classes, int n_outputs): """Constructor.""" cdef SIZE_t dummy = 0 size_t_dtype = np.array(dummy).dtype - n_classes = _check_n_classes(n_classes, size_t_dtype) + n_classes = _check_n_classes(n_classes.base, size_t_dtype) # Input/Output layout self.n_features = n_features From bb9b1f767b5f4b14c063933561e326241311788e Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Mon, 6 Feb 2023 17:18:38 +0500 Subject: [PATCH 04/14] y.base in _check_input --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 536d1c8949d29..bae2058323883 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -109,7 +109,7 @@ cdef class TreeBuilder: # since we have to copy we will make it fortran for efficiency X = np.asfortranarray(X, dtype=DTYPE) - if y.dtype != DOUBLE or not y.flags.contiguous: + if y.base.dtype != DOUBLE or not y.base.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) if (sample_weight is not None and From a1dbdc1d8aa1060f60bae2ddd8b7423e0288d33f Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Mon, 6 Feb 2023 19:19:22 +0500 Subject: [PATCH 05/14] Fix the PR by keeping cnp.ndarray for n_classes in Tree __cinit__ --- sklearn/tree/_tree.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index bae2058323883..72429eb5753a2 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -118,8 +118,7 @@ cdef class TreeBuilder: sample_weight = np.asarray(sample_weight, dtype=DOUBLE, order="C") - sample_weight_exists = sample_weight is not None and sample_weight.size > 0 - return X, y.base, sample_weight.base if sample_weight_exists else None + return X, y, sample_weight # Depth first builder --------------------------------------------------------- # A record on the stack for depth-first tree growing @@ -609,12 +608,12 @@ cdef class Tree: def __get__(self): return self._get_value_ndarray()[:self.node_count] - def __cinit__(self, int n_features, SIZE_t[:] n_classes, int n_outputs): + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): """Constructor.""" cdef SIZE_t dummy = 0 size_t_dtype = np.array(dummy).dtype - n_classes = _check_n_classes(n_classes.base, size_t_dtype) + n_classes = _check_n_classes(n_classes, size_t_dtype) # Input/Output layout self.n_features = n_features From 9401757727d056364b6d56f3d83dbf69dafd6c44 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Mon, 6 Feb 2023 20:01:57 +0500 Subject: [PATCH 06/14] Add TODO --- sklearn/tree/_tree.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 72429eb5753a2..249452b1e12a9 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -608,6 +608,8 @@ cdef class Tree: def __get__(self): return self._get_value_ndarray()[:self.node_count] + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): """Constructor.""" cdef SIZE_t dummy = 0 From ae7a7d6b2e8502aa9e7ca308f18e2e901449e66b Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Tue, 7 Feb 2023 16:53:27 +0500 Subject: [PATCH 07/14] Replace cnp.ndarray with memory views --- sklearn/tree/_tree.pxd | 12 ++-- sklearn/tree/_tree.pyx | 159 ++++++++++++++++++----------------------- 2 files changed, 77 insertions(+), 94 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 9c6d8213b0e13..20bc25bc0adbd 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -62,14 +62,14 @@ cdef class Tree: cdef int _resize(self, SIZE_t capacity) nogil except -1 cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1 - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) + cdef DOUBLE_t[:, :, ::1] _get_value_ndarray(self) + cdef Node[::1] _get_node_ndarray(self) - cpdef cnp.ndarray predict(self, object X) + cpdef predict(self, object X) - cpdef cnp.ndarray apply(self, object X) - cdef cnp.ndarray _apply_dense(self, object X) - cdef cnp.ndarray _apply_sparse_csr(self, object X) + cpdef apply(self, object X) + cdef SIZE_t[:] _apply_dense(self, object X) + cdef SIZE_t[:] _apply_sparse_csr(self, object X) cpdef object decision_path(self, object X) cdef object _decision_path_dense(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 249452b1e12a9..de11145a241b4 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -572,11 +572,11 @@ cdef class Tree: property children_left: def __get__(self): - return self._get_node_ndarray()['left_child'][:self.node_count] + return np.asarray(self._get_node_ndarray())['left_child'][:self.node_count] property children_right: def __get__(self): - return self._get_node_ndarray()['right_child'][:self.node_count] + return np.asarray(self._get_node_ndarray())['right_child'][:self.node_count] property n_leaves: def __get__(self): @@ -586,27 +586,27 @@ cdef class Tree: property feature: def __get__(self): - return self._get_node_ndarray()['feature'][:self.node_count] + return np.asarray(self._get_node_ndarray())['feature'][:self.node_count] property threshold: def __get__(self): - return self._get_node_ndarray()['threshold'][:self.node_count] + return np.asarray(self._get_node_ndarray())['threshold'][:self.node_count] property impurity: def __get__(self): - return self._get_node_ndarray()['impurity'][:self.node_count] + return np.asarray(self._get_node_ndarray())['impurity'][:self.node_count] property n_node_samples: def __get__(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] + return np.asarray(self._get_node_ndarray())['n_node_samples'][:self.node_count] property weighted_n_node_samples: def __get__(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + return np.asarray(self._get_node_ndarray())['weighted_n_node_samples'][:self.node_count] property value: def __get__(self): - return self._get_value_ndarray()[:self.node_count] + return np.asarray(self._get_value_ndarray())[:self.node_count] # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed @@ -656,8 +656,8 @@ cdef class Tree: # capacity is inferred during the __setstate__ using nodes d["max_depth"] = self.max_depth d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() + d["nodes"] = np.asarray(self._get_node_ndarray()) + d["values"] = np.asarray(self._get_value_ndarray()) return d def __setstate__(self, d): @@ -685,9 +685,12 @@ cdef class Tree: self.capacity = node_ndarray.shape[0] if self._resize_c(self.capacity) != 0: raise MemoryError("resizing tree to %d" % self.capacity) - nodes = memcpy(self.nodes, ( node_ndarray).data, + + cdef Node[::1] node_memory_view = node_ndarray + cdef DOUBLE_t[:, :, ::1] value_memory_view = value_ndarray + nodes = memcpy(self.nodes, &node_memory_view[0], self.capacity * sizeof(Node)) - value = memcpy(self.value, ( value_ndarray).data, + value = memcpy(self.value, &value_memory_view[0, 0, 0], self.capacity * self.value_stride * sizeof(double)) cdef int _resize(self, SIZE_t capacity) nogil except -1: @@ -775,22 +778,22 @@ cdef class Tree: return node_id - cpdef cnp.ndarray predict(self, object X): + cpdef predict(self, object X): """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') + out_memory_view = np.asarray(self._get_value_ndarray()) + out = out_memory_view.take(self.apply(X), axis=0, mode='clip') if self.n_outputs == 1: out = out.reshape(X.shape[0], self.max_n_classes) return out - cpdef cnp.ndarray apply(self, object X): + cpdef apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): - return self._apply_sparse_csr(X) + return np.asarray(self._apply_sparse_csr(X)) else: - return self._apply_dense(X) + return np.asarray(self._apply_dense(X)) - cdef inline cnp.ndarray _apply_dense(self, object X): + cdef inline SIZE_t[:] _apply_dense(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" # Check input @@ -806,8 +809,7 @@ cdef class Tree: cdef SIZE_t n_samples = X.shape[0] # Initialize output - cdef cnp.ndarray[SIZE_t] out = np.zeros((n_samples,), dtype=np.intp) - cdef SIZE_t* out_ptr = out.data + cdef SIZE_t[:] out = np.zeros((n_samples,), dtype=np.intp) # Initialize auxiliary data-structure cdef Node* node = NULL @@ -824,11 +826,11 @@ cdef class Tree: else: node = &self.nodes[node.right_child] - out_ptr[i] = (node - self.nodes) # node offset + out[i] = (node - self.nodes) # node offset return out - cdef inline cnp.ndarray _apply_sparse_csr(self, object X): + cdef inline SIZE_t[:] _apply_sparse_csr(self, object X): """Finds the terminal region (=leaf node) for each sample in sparse X. """ # Check input @@ -840,21 +842,15 @@ cdef class Tree: raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) # Extract input - cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data - cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices - cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr - - cdef DTYPE_t* X_data = X_data_ndarray.data - cdef INT32_t* X_indices = X_indices_ndarray.data - cdef INT32_t* X_indptr = X_indptr_ndarray.data + cdef const DTYPE_t[:] X_data = X.data + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] # Initialize output - cdef cnp.ndarray[SIZE_t, ndim=1] out = np.zeros((n_samples,), - dtype=np.intp) - cdef SIZE_t* out_ptr = out.data + cdef SIZE_t[:] out = np.zeros((n_samples,), dtype=np.intp) # Initialize auxiliary data-structure cdef DTYPE_t feature_value = 0. @@ -895,7 +891,7 @@ cdef class Tree: else: node = &self.nodes[node.right_child] - out_ptr[i] = (node - self.nodes) # node offset + out[i] = (node - self.nodes) # node offset # Free auxiliary arrays free(X_sample) @@ -926,13 +922,10 @@ cdef class Tree: cdef SIZE_t n_samples = X.shape[0] # Initialize output - cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp) - cdef SIZE_t* indptr_ptr = indptr.data - - cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples * - (1 + self.max_depth), - dtype=np.intp) - cdef SIZE_t* indices_ptr = indices.data + cdef SIZE_t[:] indptr = np.zeros(n_samples + 1, dtype=np.intp) + cdef SIZE_t[:] indices = np.zeros( + n_samples * (1 + self.max_depth), dtype=np.intp + ) # Initialize auxiliary data-structure cdef Node* node = NULL @@ -941,13 +934,13 @@ cdef class Tree: with nogil: for i in range(n_samples): node = self.nodes - indptr_ptr[i + 1] = indptr_ptr[i] + indptr[i + 1] = indptr[i] # Add all external nodes while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) - indptr_ptr[i + 1] += 1 + indices[indptr[i + 1]] = (node - self.nodes) + indptr[i + 1] += 1 if X_ndarray[i, node.feature] <= node.threshold: node = &self.nodes[node.left_child] @@ -955,12 +948,11 @@ cdef class Tree: node = &self.nodes[node.right_child] # Add the leave node - indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) - indptr_ptr[i + 1] += 1 + indices[indptr[i + 1]] = (node - self.nodes) + indptr[i + 1] += 1 indices = indices[:indptr[n_samples]] - cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices), - dtype=np.intp) + cdef SIZE_t[:] data = np.ones(shape=len(indices), dtype=np.intp) out = csr_matrix((data, indices, indptr), shape=(n_samples, self.node_count)) @@ -978,25 +970,18 @@ cdef class Tree: raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) # Extract input - cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data - cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices - cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr - - cdef DTYPE_t* X_data = X_data_ndarray.data - cdef INT32_t* X_indices = X_indices_ndarray.data - cdef INT32_t* X_indptr = X_indptr_ndarray.data + cdef const DTYPE_t[:] X_data = X.data + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] # Initialize output - cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp) - cdef SIZE_t* indptr_ptr = indptr.data - - cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples * - (1 + self.max_depth), - dtype=np.intp) - cdef SIZE_t* indices_ptr = indices.data + cdef SIZE_t[:] indptr = np.zeros(n_samples + 1, dtype=np.intp) + cdef SIZE_t[:] indices = np.zeros( + n_samples * (1 + self.max_depth), dtype=np.intp + ) # Initialize auxiliary data-structure cdef DTYPE_t feature_value = 0. @@ -1018,7 +1003,7 @@ cdef class Tree: for i in range(n_samples): node = self.nodes - indptr_ptr[i + 1] = indptr_ptr[i] + indptr[i + 1] = indptr[i] for k in range(X_indptr[i], X_indptr[i + 1]): feature_to_sample[X_indices[k]] = i @@ -1028,8 +1013,8 @@ cdef class Tree: while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) - indptr_ptr[i + 1] += 1 + indices[indptr[i + 1]] = (node - self.nodes) + indptr[i + 1] += 1 if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] @@ -1043,16 +1028,15 @@ cdef class Tree: node = &self.nodes[node.right_child] # Add the leave node - indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) - indptr_ptr[i + 1] += 1 + indices[indptr[i + 1]] = (node - self.nodes) + indptr[i + 1] += 1 # Free auxiliary arrays free(X_sample) free(feature_to_sample) indices = indices[:indptr[n_samples]] - cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices), - dtype=np.intp) + cdef SIZE_t[:] data = np.ones(shape=len(indices), dtype=np.intp) out = csr_matrix((data, indices, indptr), shape=(n_samples, self.node_count)) @@ -1095,9 +1079,7 @@ cdef class Tree: cdef double normalizer = 0. - cdef cnp.ndarray[cnp.float64_t, ndim=1] importances - importances = np.zeros((self.n_features,)) - cdef DOUBLE_t* importance_data = importances.data + cdef cnp.float64_t[:] importances = np.zeros((self.n_features,)) with nogil: while node != end_node: @@ -1106,24 +1088,24 @@ cdef class Tree: left = &nodes[node.left_child] right = &nodes[node.right_child] - importance_data[node.feature] += ( + importances[node.feature] += ( node.weighted_n_node_samples * node.impurity - left.weighted_n_node_samples * left.impurity - right.weighted_n_node_samples * right.impurity) node += 1 - importances /= nodes[0].weighted_n_node_samples + importances = np.divide(importances, nodes[0].weighted_n_node_samples) if normalize: normalizer = np.sum(importances) if normalizer > 0.0: # Avoid dividing by zero (e.g., when root is pure) - importances /= normalizer + importances = np.divide(importances, normalizer) - return importances + return np.asarray(importances) - cdef cnp.ndarray _get_value_ndarray(self): + cdef DOUBLE_t[:, :, ::1] _get_value_ndarray(self): """Wraps value as a 3-d NumPy array. The array keeps a reference to this Tree, which manages the underlying @@ -1133,14 +1115,14 @@ cdef class Tree: shape[0] = self.node_count shape[1] = self.n_outputs shape[2] = self.max_n_classes - cdef cnp.ndarray arr + cdef DOUBLE_t[:, :, ::1] arr arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: + if PyArray_SetBaseObject(arr.base, self) < 0: raise ValueError("Can't initialize array.") return arr - cdef cnp.ndarray _get_node_ndarray(self): + cdef Node[::1] _get_node_ndarray(self): """Wraps nodes as a NumPy struct array. The array keeps a reference to this Tree, which manages the underlying @@ -1151,14 +1133,14 @@ cdef class Tree: shape[0] = self.node_count cdef cnp.npy_intp strides[1] strides[0] = sizeof(Node) - cdef cnp.ndarray arr + cdef Node[::1] arr Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, + arr = PyArray_NewFromDescr( np.ndarray, NODE_DTYPE, 1, shape, strides, self.nodes, cnp.NPY_DEFAULT, None) Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: + if PyArray_SetBaseObject(arr.base, self) < 0: raise ValueError("Can't initialize array.") return arr @@ -1688,10 +1670,8 @@ def ccp_pruning_path(Tree orig_tree): cdef: UINT32_t total_items = path_finder.count - cnp.ndarray ccp_alphas = np.empty(shape=total_items, - dtype=np.float64) - cnp.ndarray impurities = np.empty(shape=total_items, - dtype=np.float64) + DOUBLE_t[:] ccp_alphas = np.empty(shape=total_items, dtype=np.float64) + DOUBLE_t[:] impurities = np.empty(shape=total_items, dtype=np.float64) UINT32_t count = 0 while count < total_items: @@ -1699,7 +1679,10 @@ def ccp_pruning_path(Tree orig_tree): impurities[count] = path_finder.impurities[count] count += 1 - return {'ccp_alphas': ccp_alphas, 'impurities': impurities} + return { + 'ccp_alphas': np.asarray(ccp_alphas), + 'impurities': np.asarray(impurities), + } cdef struct BuildPrunedRecord: From 21cf642c03e87f4c688558ae53359b713a29b9e0 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Tue, 7 Feb 2023 17:26:05 +0500 Subject: [PATCH 08/14] Add TODO to remove redundant y check in _check_input --- sklearn/tree/_tree.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index de11145a241b4..adcd2277b230d 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -109,6 +109,9 @@ cdef class TreeBuilder: # since we have to copy we will make it fortran for efficiency X = np.asfortranarray(X, dtype=DTYPE) + # TODO: This check for y seems to be redundant, as it is also + # present in the BaseDecisionTree's fit method, and therefore + # can be removed. if y.base.dtype != DOUBLE or not y.base.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) From f5daddcb6fd8af1009271b678e904acc48cb33d7 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Tue, 7 Feb 2023 21:30:39 +0500 Subject: [PATCH 09/14] Applied suggestions provided on PR --- setup.py | 1 + sklearn/tree/_tree.pyx | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f252e1ec03ad8..dc2edc6a8127b 100755 --- a/setup.py +++ b/setup.py @@ -111,6 +111,7 @@ "sklearn.svm._libsvm_sparse", "sklearn.svm._newrand", "sklearn.tree._splitter", + "sklearn.tree._tree", "sklearn.tree._utils", "sklearn.utils._cython_blas", "sklearn.utils._fast_dict", diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index adcd2277b230d..52a8f996b273f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1141,7 +1141,7 @@ cdef class Tree: arr = PyArray_NewFromDescr( np.ndarray, NODE_DTYPE, 1, shape, strides, self.nodes, - cnp.NPY_DEFAULT, None) + cnp.NPY_ARRAY_DEFAULT, None) Py_INCREF(self) if PyArray_SetBaseObject(arr.base, self) < 0: raise ValueError("Can't initialize array.") From 0496453d54c2c81e7ed1a9137bd326d4fd8037c3 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Wed, 8 Feb 2023 11:49:18 +0500 Subject: [PATCH 10/14] Applied further suggestions --- sklearn/tree/_tree.pxd | 8 +++---- sklearn/tree/_tree.pyx | 52 ++++++++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 20bc25bc0adbd..62338ddbbc20e 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -62,14 +62,14 @@ cdef class Tree: cdef int _resize(self, SIZE_t capacity) nogil except -1 cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1 - cdef DOUBLE_t[:, :, ::1] _get_value_ndarray(self) - cdef Node[::1] _get_node_ndarray(self) + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) cpdef predict(self, object X) cpdef apply(self, object X) - cdef SIZE_t[:] _apply_dense(self, object X) - cdef SIZE_t[:] _apply_sparse_csr(self, object X) + cdef cnp.ndarray _apply_dense(self, object X) + cdef cnp.ndarray _apply_sparse_csr(self, object X) cpdef object decision_path(self, object X) cdef object _decision_path_dense(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 52a8f996b273f..0e7cf6ecef6e0 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -575,11 +575,11 @@ cdef class Tree: property children_left: def __get__(self): - return np.asarray(self._get_node_ndarray())['left_child'][:self.node_count] + return self._get_node_ndarray()['left_child'][:self.node_count] property children_right: def __get__(self): - return np.asarray(self._get_node_ndarray())['right_child'][:self.node_count] + return self._get_node_ndarray()['right_child'][:self.node_count] property n_leaves: def __get__(self): @@ -589,27 +589,27 @@ cdef class Tree: property feature: def __get__(self): - return np.asarray(self._get_node_ndarray())['feature'][:self.node_count] + return self._get_node_ndarray()['feature'][:self.node_count] property threshold: def __get__(self): - return np.asarray(self._get_node_ndarray())['threshold'][:self.node_count] + return self._get_node_ndarray()['threshold'][:self.node_count] property impurity: def __get__(self): - return np.asarray(self._get_node_ndarray())['impurity'][:self.node_count] + return self._get_node_ndarray()['impurity'][:self.node_count] property n_node_samples: def __get__(self): - return np.asarray(self._get_node_ndarray())['n_node_samples'][:self.node_count] + return self._get_node_ndarray()['n_node_samples'][:self.node_count] property weighted_n_node_samples: def __get__(self): - return np.asarray(self._get_node_ndarray())['weighted_n_node_samples'][:self.node_count] + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] property value: def __get__(self): - return np.asarray(self._get_value_ndarray())[:self.node_count] + return self._get_value_ndarray()[:self.node_count] # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed @@ -659,8 +659,8 @@ cdef class Tree: # capacity is inferred during the __setstate__ using nodes d["max_depth"] = self.max_depth d["node_count"] = self.node_count - d["nodes"] = np.asarray(self._get_node_ndarray()) - d["values"] = np.asarray(self._get_value_ndarray()) + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() return d def __setstate__(self, d): @@ -783,8 +783,8 @@ cdef class Tree: cpdef predict(self, object X): """Predict target for X.""" - out_memory_view = np.asarray(self._get_value_ndarray()) - out = out_memory_view.take(self.apply(X), axis=0, mode='clip') + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') if self.n_outputs == 1: out = out.reshape(X.shape[0], self.max_n_classes) return out @@ -792,11 +792,11 @@ cdef class Tree: cpdef apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): - return np.asarray(self._apply_sparse_csr(X)) + return self._apply_sparse_csr(X) else: - return np.asarray(self._apply_dense(X)) + return self._apply_dense(X) - cdef inline SIZE_t[:] _apply_dense(self, object X): + cdef inline cnp.ndarray _apply_dense(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" # Check input @@ -831,9 +831,9 @@ cdef class Tree: out[i] = (node - self.nodes) # node offset - return out + return np.asarray(out) - cdef inline SIZE_t[:] _apply_sparse_csr(self, object X): + cdef inline cnp.ndarray _apply_sparse_csr(self, object X): """Finds the terminal region (=leaf node) for each sample in sparse X. """ # Check input @@ -900,7 +900,7 @@ cdef class Tree: free(X_sample) free(feature_to_sample) - return out + return np.asarray(out) cpdef object decision_path(self, object X): """Finds the decision path (=node) for each sample in X.""" @@ -1082,7 +1082,7 @@ cdef class Tree: cdef double normalizer = 0. - cdef cnp.float64_t[:] importances = np.zeros((self.n_features,)) + cdef cnp.float64_t[:] importances = np.zeros(self.n_features) with nogil: while node != end_node: @@ -1097,18 +1097,20 @@ cdef class Tree: right.weighted_n_node_samples * right.impurity) node += 1 - importances = np.divide(importances, nodes[0].weighted_n_node_samples) + for i in range(self.n_features): + importances[i] /= nodes[0].weighted_n_node_samples if normalize: normalizer = np.sum(importances) if normalizer > 0.0: # Avoid dividing by zero (e.g., when root is pure) - importances = np.divide(importances, normalizer) + for i in range(self.n_features): + importances[i] /= normalizer return np.asarray(importances) - cdef DOUBLE_t[:, :, ::1] _get_value_ndarray(self): + cdef cnp.ndarray _get_value_ndarray(self): """Wraps value as a 3-d NumPy array. The array keeps a reference to this Tree, which manages the underlying @@ -1123,9 +1125,9 @@ cdef class Tree: Py_INCREF(self) if PyArray_SetBaseObject(arr.base, self) < 0: raise ValueError("Can't initialize array.") - return arr + return np.asarray(arr) - cdef Node[::1] _get_node_ndarray(self): + cdef cnp.ndarray _get_node_ndarray(self): """Wraps nodes as a NumPy struct array. The array keeps a reference to this Tree, which manages the underlying @@ -1145,7 +1147,7 @@ cdef class Tree: Py_INCREF(self) if PyArray_SetBaseObject(arr.base, self) < 0: raise ValueError("Can't initialize array.") - return arr + return np.asarray(arr) def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, From c18c3cb72996fdc940f615c844bc94d53d4b6ac3 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Wed, 8 Feb 2023 18:33:08 +0500 Subject: [PATCH 11/14] Reverted some lines --- sklearn/tree/_tree.pxd | 4 ++-- sklearn/tree/_tree.pyx | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 62338ddbbc20e..9c6d8213b0e13 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -65,9 +65,9 @@ cdef class Tree: cdef cnp.ndarray _get_value_ndarray(self) cdef cnp.ndarray _get_node_ndarray(self) - cpdef predict(self, object X) + cpdef cnp.ndarray predict(self, object X) - cpdef apply(self, object X) + cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 0e7cf6ecef6e0..be66c03239709 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -781,7 +781,7 @@ cdef class Tree: return node_id - cpdef predict(self, object X): + cpdef cnp.ndarray predict(self, object X): """Predict target for X.""" out = self._get_value_ndarray().take(self.apply(X), axis=0, mode='clip') @@ -789,7 +789,7 @@ cdef class Tree: out = out.reshape(X.shape[0], self.max_n_classes) return out - cpdef apply(self, object X): + cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): return self._apply_sparse_csr(X) @@ -1140,7 +1140,7 @@ cdef class Tree: strides[0] = sizeof(Node) cdef Node[::1] arr Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( np.ndarray, + arr = PyArray_NewFromDescr( cnp.ndarray, NODE_DTYPE, 1, shape, strides, self.nodes, cnp.NPY_ARRAY_DEFAULT, None) From 317abb38e192540be491c4589bbf77c00c0d4918 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Wed, 8 Feb 2023 20:02:34 +0500 Subject: [PATCH 12/14] PR suggestions --- sklearn/tree/_tree.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index be66c03239709..1be17659bd67c 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -812,7 +812,7 @@ cdef class Tree: cdef SIZE_t n_samples = X.shape[0] # Initialize output - cdef SIZE_t[:] out = np.zeros((n_samples,), dtype=np.intp) + cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp) # Initialize auxiliary data-structure cdef Node* node = NULL @@ -853,7 +853,7 @@ cdef class Tree: cdef SIZE_t n_features = X.shape[1] # Initialize output - cdef SIZE_t[:] out = np.zeros((n_samples,), dtype=np.intp) + cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp) # Initialize auxiliary data-structure cdef DTYPE_t feature_value = 0. From e7e806bfe356d1ca62e26735b9136571efb318c4 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Wed, 8 Feb 2023 22:32:28 +0500 Subject: [PATCH 13/14] Replace cnp.ndarray back in _get_value_ndarray and _get_node_ndarray --- sklearn/tree/_tree.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 1be17659bd67c..21d58e58dd475 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1120,12 +1120,12 @@ cdef class Tree: shape[0] = self.node_count shape[1] = self.n_outputs shape[2] = self.max_n_classes - cdef DOUBLE_t[:, :, ::1] arr + cdef cnp.ndarray arr arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) Py_INCREF(self) - if PyArray_SetBaseObject(arr.base, self) < 0: + if PyArray_SetBaseObject(arr, self) < 0: raise ValueError("Can't initialize array.") - return np.asarray(arr) + return arr cdef cnp.ndarray _get_node_ndarray(self): """Wraps nodes as a NumPy struct array. @@ -1138,16 +1138,16 @@ cdef class Tree: shape[0] = self.node_count cdef cnp.npy_intp strides[1] strides[0] = sizeof(Node) - cdef Node[::1] arr + cdef cnp.ndarray arr Py_INCREF(NODE_DTYPE) arr = PyArray_NewFromDescr( cnp.ndarray, NODE_DTYPE, 1, shape, strides, self.nodes, cnp.NPY_ARRAY_DEFAULT, None) Py_INCREF(self) - if PyArray_SetBaseObject(arr.base, self) < 0: + if PyArray_SetBaseObject(arr, self) < 0: raise ValueError("Can't initialize array.") - return np.asarray(arr) + return arr def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, From 6d6dc0475a0105b4c0fe244b4099a3c67706d406 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Thu, 9 Feb 2023 12:12:22 +0500 Subject: [PATCH 14/14] Add black formatting in modified method's signatures --- sklearn/tree/_tree.pxd | 17 ++++++++++++++--- sklearn/tree/_tree.pyx | 35 +++++++++++++++++++++++++++-------- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 9c6d8213b0e13..3e60e91d6940a 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -99,6 +99,17 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping - cpdef build(self, Tree tree, object X, const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight=*) - cdef _check_input(self, object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight) + cpdef build( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=*, + ) + + cdef _check_input( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + ) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 21d58e58dd475..72e98b33b847f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -86,13 +86,22 @@ NODE_DTYPE = np.asarray((&dummy)).dtype cdef class TreeBuilder: """Interface for different tree building strategies.""" - cpdef build(self, Tree tree, object X, const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight=None): + cpdef build( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None, + ): """Build a decision tree from the training set (X, y).""" pass - cdef inline _check_input(self, object X, const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight): + cdef inline _check_input( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + ): """Check input dtype, layout and format""" if issparse(X): X = X.tocsc() @@ -147,8 +156,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease - cpdef build(self, Tree tree, object X, const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight=None): + cpdef build( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None, + ): """Build a decision tree from the training set (X, y).""" # check input @@ -338,8 +352,13 @@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease - cpdef build(self, Tree tree, object X, const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight=None): + cpdef build( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None, + ): """Build a decision tree from the training set (X, y).""" # check input