From 34355bbc6cd4d63d532ecfc9c1bae033cf482a09 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 12 Apr 2015 21:47:11 -0700 Subject: [PATCH 01/35] Created SplitValue datatype to generalize the concept of a threshold to categorical variables. Replaced the threshold attribute of SplitRecord and Node with SplitValue. --- sklearn/tree/_splitter.pxd | 27 +++++++++++++++++++++-- sklearn/tree/_splitter.pyx | 44 ++++++++++++++++++-------------------- sklearn/tree/_tree.pxd | 5 ++++- sklearn/tree/_tree.pyx | 39 ++++++++++++++++++--------------- 4 files changed, 72 insertions(+), 43 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index e716736e1cf91..8756e94fa5f42 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -19,6 +19,28 @@ ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer +ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer + +cdef union SplitValue: + # Union type to generalize the concept of a threshold to + # categorical features. For non-categorical features, use the + # threshold member. It acts just as before, where feature values + # less than or equal to the threshold go left, and values greater + # than the threshold go right. + # + # For categorical features, use the cat_split member. It works in + # one of two ways, indicated by the value of its least significant + # bit (LSB). If the LSB is 0, then cat_split acts as a bitfield + # for up to 64 categories, sending samples left if the bit + # corresponding to their category is 1 or right if it is 0. If the + # LSB is 1, then the more significant 32 bits of cat_split is a + # random seed and the next 31 bits are the number of deviates to + # draw. To evaluate a sample, draw the required set of categories + # and check if the sample's feature value is in the set. If so, + # send it left; otherwise right. This second method allows up to + # 2**31 category values, but can only be used for RandomSplitter. + DOUBLE_t threshold + UINT64_t cat_split cdef struct SplitRecord: # Data to track sample split @@ -26,7 +48,8 @@ cdef struct SplitRecord: SIZE_t pos # Split samples array at the given position, # i.e. count of samples below threshold for feature. # pos is >= end if the node is a leaf. - double threshold # Threshold to split at. + SplitValue split_value # Generalized threshold for categorical and + # non-categorical features. double improvement # Impurity improvement given parent node. double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. @@ -95,4 +118,4 @@ cdef class Splitter: cdef void node_value(self, double* dest) nogil - cdef double node_impurity(self) nogil \ No newline at end of file + cdef double node_impurity(self) nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 4c59b6960e7a0..4c20834259388 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -48,7 +48,7 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil: self.impurity_right = INFINITY self.pos = start_pos self.feature = 0 - self.threshold = 0. + self.split_value.threshold = 0. 
self.improvement = -INFINITY cdef class Splitter: @@ -461,10 +461,10 @@ cdef class BestSplitter(BaseDenseSplitter): if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement - current.threshold = (Xf[p - 1] + Xf[p]) / 2.0 + current.split_value.threshold = (Xf[p - 1] + Xf[p]) / 2.0 - if current.threshold == Xf[p]: - current.threshold = Xf[p - 1] + if current.split_value.threshold == Xf[p]: + current.split_value.threshold = Xf[p - 1] best = current # copy @@ -475,7 +475,7 @@ cdef class BestSplitter(BaseDenseSplitter): p = start while p < partition_end: - if X[X_sample_stride * samples[p] + feature_offset] <= best.threshold: + if X[X_sample_stride * samples[p] + feature_offset] <= best.split_value.threshold: p += 1 else: @@ -749,19 +749,18 @@ cdef class RandomSplitter(BaseDenseSplitter): features[f_i], features[f_j] = features[f_j], features[f_i] # Draw a random threshold - current.threshold = rand_uniform(min_feature_value, - max_feature_value, - random_state) + current.split_value.threshold = rand_uniform( + min_feature_value, max_feature_value, random_state) - if current.threshold == max_feature_value: - current.threshold = min_feature_value + if current.split_value.threshold == max_feature_value: + current.split_value.threshold = min_feature_value # Partition partition_end = end p = start while p < partition_end: current_feature_value = Xf[p] - if current_feature_value <= current.threshold: + if current_feature_value <= current.split_value.threshold: p += 1 else: partition_end -= 1 @@ -803,7 +802,7 @@ cdef class RandomSplitter(BaseDenseSplitter): p = start while p < partition_end: - if X[X_sample_stride * samples[p] + feature_stride] <= best.threshold: + if X[X_sample_stride * samples[p] + feature_stride] <= best.split_value.threshold: p += 1 else: @@ -1347,9 +1346,9 @@ cdef class BestSparseSplitter(BaseSparseSplitter): if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement - current.threshold = (Xf[p_prev] + Xf[p]) / 2.0 - if current.threshold == Xf[p]: - current.threshold = Xf[p_prev] + current.split_value.threshold = (Xf[p_prev] + Xf[p]) / 2.0 + if current.split_value.threshold == Xf[p]: + current.split_value.threshold = Xf[p_prev] best = current @@ -1358,7 +1357,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): self.extract_nnz(best.feature, &end_negative, &start_positive, &is_samples_sorted) - self._partition(best.threshold, end_negative, start_positive, + self._partition(best.split_value.threshold, end_negative, start_positive, best.pos) self.criterion.reset() @@ -1542,15 +1541,14 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): features[f_i], features[f_j] = features[f_j], features[f_i] # Draw a random threshold - current.threshold = rand_uniform(min_feature_value, - max_feature_value, - random_state) + current.split_value.threshold = rand_uniform( + min_feature_value, max_feature_value, random_state) - if current.threshold == max_feature_value: - current.threshold = min_feature_value + if current.split_value.threshold == max_feature_value: + current.split_value.threshold = min_feature_value # Partition - current.pos = self._partition(current.threshold, + current.pos = self._partition(current.split_value.threshold, end_negative, start_positive, start_positive + @@ -1586,7 +1584,7 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): self.extract_nnz(best.feature, &end_negative, &start_positive, &is_samples_sorted) - self._partition(best.threshold, end_negative, 
start_positive, + self._partition(best.split_value.threshold, end_negative, start_positive, best.pos) self.criterion.reset() diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 6b34e51a07c8e..59bd414f0423b 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -17,9 +17,11 @@ ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer +ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer from ._splitter cimport Splitter from ._splitter cimport SplitRecord +from ._splitter cimport SplitValue cdef struct Node: # Base storage structure for the nodes in a Tree object @@ -27,7 +29,8 @@ SIZE_t left_child # id of the left child of the node SIZE_t right_child # id of the right child of the node SIZE_t feature # Feature used for splitting the node - DOUBLE_t threshold # Threshold value at the node + SplitValue split_value # Generalized threshold for categorical and + # non-categorical features. DOUBLE_t impurity # Impurity of the node (i.e., the value of the criterion) SIZE_t n_node_samples # Number of samples at the node DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8bc02ef44d94d..58df97698e71d 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -66,16 +66,21 @@ cdef SIZE_t INITIAL_STACK_SIZE = 10 cdef DTYPE_t MIN_IMPURITY_SPLIT = 1e-7 # Repeat struct definition for numpy +SPLITVALUE_DTYPE = np.dtype({ + 'names': ['threshold', 'cat_split'], + 'formats': [np.float64, np.uint64], + 'offsets': [0, 0] +}) NODE_DTYPE = np.dtype({ - 'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', - 'n_node_samples', 'weighted_n_node_samples'], - 'formats': [np.intp, np.intp, np.intp, np.float64, np.float64, np.intp, - np.float64], + 'names': ['left_child', 'right_child', 'feature', 'split_value', + 'impurity', 'n_node_samples', 'weighted_n_node_samples'], + 'formats': [np.intp, np.intp, np.intp, SPLITVALUE_DTYPE, np.float64, + np.intp, np.float64], 'offsets': [ <Py_ssize_t> &(<Node*> NULL).left_child, <Py_ssize_t> &(<Node*> NULL).right_child, <Py_ssize_t> &(<Node*> NULL).feature, - <Py_ssize_t> &(<Node*> NULL).threshold, + <Py_ssize_t> &(<Node*> NULL).split_value, <Py_ssize_t> &(<Node*> NULL).impurity, <Py_ssize_t> &(<Node*> NULL).n_node_samples, <Py_ssize_t> &(<Node*> NULL).weighted_n_node_samples @@ -229,8 +234,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or (split.pos >= end) node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, - weighted_n_node_samples) + split.split_value.threshold, impurity, + n_node_samples, weighted_n_node_samples) if node_id == <SIZE_t>(-1): rc = -1 @@ -356,7 +361,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.left_child = _TREE_LEAF node.right_child = _TREE_LEAF node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED + node.split_value.threshold = _TREE_UNDEFINED else: # Node is expandable @@ -444,8 +449,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, - weighted_n_node_samples) + split.feature, split.split_value.threshold, + impurity, n_node_samples, weighted_n_node_samples) if node_id == <SIZE_t>(-1): return -1 @@ -558,7 +563,7 @@ cdef class Tree: property threshold: def __get__(self): - return self._get_node_ndarray()['threshold'][:self.node_count] + return
self._get_node_ndarray()['split_value']['threshold'][:self.node_count] property impurity: def __get__(self): @@ -725,12 +730,12 @@ cdef class Tree: node.left_child = _TREE_LEAF node.right_child = _TREE_LEAF node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED + node.split_value.threshold = _TREE_UNDEFINED else: # left_child and right_child will be set later node.feature = feature - node.threshold = threshold + node.split_value.threshold = threshold self.node_count += 1 @@ -784,7 +789,7 @@ cdef class Tree: while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: if X_ptr[X_sample_stride * i + - X_fx_stride * node.feature] <= node.threshold: + X_fx_stride * node.feature] <= node.split_value.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -855,7 +860,7 @@ cdef class Tree: else: feature_value = 0. - if feature_value <= node.threshold: + if feature_value <= node.split_value.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -918,7 +923,7 @@ cdef class Tree: indptr_ptr[i + 1] += 1 if X_ptr[X_sample_stride * i + - X_fx_stride * node.feature] <= node.threshold: + X_fx_stride * node.feature] <= node.split_value.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1006,7 +1011,7 @@ cdef class Tree: else: feature_value = 0. - if feature_value <= node.threshold: + if feature_value <= node.split_value.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] From a9db0ea636ff0eb25a8a78eeb5795164bd593124 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 12 Apr 2015 21:49:30 -0700 Subject: [PATCH 02/35] Added attribute n_categories to Splitter and Tree, an array of ints that defaults to -1 for each feature (indicating non-categorical feature). --- sklearn/tree/_splitter.pxd | 2 ++ sklearn/tree/_splitter.pyx | 8 ++++++++ sklearn/tree/_tree.pxd | 3 +++ sklearn/tree/_tree.pyx | 28 +++++++++++++++++++++++++++- sklearn/tree/_utils.pxd | 1 + 5 files changed, 41 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 8756e94fa5f42..eb296d597b388 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -86,6 +86,8 @@ cdef class Splitter: cdef DOUBLE_t* y cdef SIZE_t y_stride cdef DOUBLE_t* sample_weight + cdef INT32_t* n_categories # (n_features) array giving number of + # categories (<0 for non-categorical) # The samples vector `samples` is maintained by the Splitter object such # that the samples contained in a node are contiguous. 
With this setting, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 4c20834259388..d111a0fd3a2f2 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -95,6 +95,7 @@ cdef class Splitter: self.y = NULL self.y_stride = 0 self.sample_weight = NULL + self.n_categories = NULL self.max_features = max_features self.min_samples_leaf = min_samples_leaf @@ -109,6 +110,7 @@ cdef class Splitter: free(self.features) free(self.constant_features) free(self.feature_values) + free(self.n_categories) def __getstate__(self): return {} @@ -181,6 +183,12 @@ cdef class Splitter: self.sample_weight = sample_weight + # Initialize the number of categories for each feature + # A value of -1 indicates a non-categorical feature + safe_realloc(&self.n_categories, n_features) + for i in range(n_features): + self.n_categories[i] = -1 + cdef void node_reset(self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples) nogil: """Reset splitter on node samples[start:end]. diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 59bd414f0423b..732a18597d590 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -55,6 +55,8 @@ cdef class Tree: cdef Node* nodes # Array of nodes cdef double* value # (capacity, n_outputs, max_n_classes) array of values cdef SIZE_t value_stride # = n_outputs * max_n_classes + cdef INT32_t* n_categories # (n_features) array giving number of + # categories (<0 for non-categorical) # Methods cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, @@ -66,6 +68,7 @@ cdef class Tree: cdef np.ndarray _get_value_ndarray(self) cdef np.ndarray _get_node_ndarray(self) + cdef np.ndarray _get_ncat_ndarray(self) cpdef np.ndarray predict(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 58df97698e71d..e56e628d804dd 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -603,6 +603,10 @@ cdef class Tree: self.capacity = 0 self.value = NULL self.nodes = NULL + self.n_categories = NULL + safe_realloc(&self.n_categories, n_features) + for k in range(n_features): + self.n_categories[k] = -1 def __dealloc__(self): """Destructor.""" @@ -610,6 +614,7 @@ cdef class Tree: free(self.n_classes) free(self.value) free(self.nodes) + free(self.n_categories) def __reduce__(self): """Reduce re-implementation, for pickling.""" @@ -625,6 +630,7 @@ cdef class Tree: d["node_count"] = self.node_count d["nodes"] = self._get_node_ndarray() d["values"] = self._get_value_ndarray() + d["n_categories"] = self._get_ncat_ndarray() return d def __setstate__(self, d): @@ -638,6 +644,7 @@ cdef class Tree: node_ndarray = d['nodes'] value_ndarray = d['values'] + ncat_ndarray = d['n_categories'] value_shape = (node_ndarray.shape[0], self.n_outputs, self.max_n_classes) @@ -646,7 +653,10 @@ cdef class Tree: not node_ndarray.flags.c_contiguous or value_ndarray.shape != value_shape or not value_ndarray.flags.c_contiguous or - value_ndarray.dtype != np.float64): + value_ndarray.dtype != np.float64 or + ncat_ndarray.shape != (self.n_features,) or + ncat_ndarray.dtype != np.int32 or + not ncat_ndarray.flags.c_contiguous): raise ValueError('Did not recognise loaded array layout') self.capacity = node_ndarray.shape[0] @@ -656,6 +666,8 @@ cdef class Tree: self.capacity * sizeof(Node)) value = memcpy(self.value, ( value_ndarray).data, self.capacity * self.value_stride * sizeof(double)) + ncat = memcpy(self.n_categories, ( ncat_ndarray).data, + self.n_features * sizeof(INT32_t)) cdef void _resize(self, SIZE_t capacity) except 
*: """Resize all inner arrays to `capacity`, if `capacity` == -1, then @@ -1087,6 +1099,20 @@ cdef class Tree: arr.base = self return arr + cdef np.ndarray _get_ncat_ndarray(self): + """Wraps n_categories as a 3-d Numpy array + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef np.npy_intp shape[1] + shape[0] = self.n_features + cdef np.ndarray arr + arr = np.PyArray_SimpleNewFromData(1, shape, np.NPY_INT32, self.n_categories) + Py_INCREF(self) + arr.base = self + return arr + cdef np.ndarray _get_node_ndarray(self): """Wraps nodes as a NumPy struct array. diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 8f659aa86374e..cdb6ac1f54790 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -33,6 +33,7 @@ ctypedef fused realloc_ptr: (DTYPE_t*) (SIZE_t*) (unsigned char*) + (INT32_t*) cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * From 53457e5812703adfcd8758470344dc6f5a4002be Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 12 Apr 2015 22:39:11 -0700 Subject: [PATCH 03/35] Added a goes_left function for evaluating splits. --- sklearn/tree/_utils.pxd | 6 ++++++ sklearn/tree/_utils.pyx | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index cdb6ac1f54790..5b73df0ebb27e 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -10,6 +10,8 @@ import numpy as np cimport numpy as np +from ._splitter cimport SplitValue + ctypedef np.npy_float32 DTYPE_t # Type of X ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters @@ -51,6 +53,10 @@ cdef double rand_uniform(double low, double high, cdef double log(double x) nogil +# Function for traversing a tree +cdef bint goes_left(DTYPE_t feature_value, SplitValue split, + INT32_t n_categories) nogil + # ============================================================================= # Stack data structure # ============================================================================= diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 70232e3851050..a2c975506a751 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -82,6 +82,26 @@ cdef inline double rand_uniform(double low, double high, cdef inline double log(double x) nogil: return ln(x) / ln(2.0) +cdef bint goes_left(DTYPE_t feature_value, SplitValue split, + INT32_t n_categories) nogil: + """Determine whether a sample goes to the left or right child node.""" + cdef UINT32_t rng_seed + + if n_categories < 1: + # Non-categorical feature + return feature_value <= split.threshold + elif (split.cat_split & 1 == 0): + # Bitfield model + return (split.cat_split >> feature_value) & 1 + else: + # Random model + rng_seed = split.cat_split >> 32 + for q in range((split.cat_split & 0xFFFFFFFF) >> 1): + if (feature_value == + rand_int(0, n_categories, &rng_seed)): + return 1 + return 0 + # ============================================================================= # Stack data structure From ba93458cad0a55ee08261bd891bae26c3d21f213 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 12 Apr 2015 22:46:49 -0700 Subject: [PATCH 04/35] Tree.apply() now uses the goes_left function. 
--- sklearn/tree/_splitter.pxd | 2 +- sklearn/tree/_tree.pxd | 4 ++-- sklearn/tree/_tree.pyx | 17 +++++++++-------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index eb296d597b388..c75714af1c1af 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -21,7 +21,7 @@ ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer -cdef union SplitValue: +ctypedef union SplitValue: # Union type to generalize the concept of a threshold to # categorical features. For non-categorical features, use the # threshold member. It acts just as before, where feature values diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 732a18597d590..dd22775484e7b 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -60,8 +60,8 @@ cdef class Tree: # Methods cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, + SIZE_t feature, SplitValue split_value, + double impurity, SIZE_t n_node_samples, double weighted_n_samples) nogil cdef void _resize(self, SIZE_t capacity) except * cdef int _resize_c(self, SIZE_t capacity=*) nogil diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index e56e628d804dd..9d557de88a857 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -35,6 +35,7 @@ from ._utils cimport StackRecord from ._utils cimport PriorityHeap from ._utils cimport PriorityHeapRecord from ._utils cimport safe_realloc +from ._utils cimport goes_left from ._utils cimport sizet_ptr_to_ndarray cdef extern from "numpy/arrayobject.h": @@ -186,7 +187,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SplitRecord split cdef SIZE_t node_id - cdef double threshold cdef double impurity = INFINITY cdef SIZE_t n_constant_features cdef bint is_leaf @@ -234,7 +234,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or (split.pos >= end) node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.split_value.threshold, impurity, + split.split_value, impurity, n_node_samples, weighted_n_node_samples) if node_id == (-1): @@ -449,7 +449,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.split_value.threshold, + split.feature, split.split_value, impurity, n_node_samples, weighted_n_node_samples) if node_id == (-1): return -1 @@ -713,7 +713,7 @@ cdef class Tree: return 0 cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, + SIZE_t feature, SplitValue split_value, double impurity, SIZE_t n_node_samples, double weighted_n_node_samples) nogil: """Add a node to the tree. @@ -747,7 +747,7 @@ cdef class Tree: else: # left_child and right_child will be set later node.feature = feature - node.split_value.threshold = threshold + node.split_value = split_value self.node_count += 1 @@ -800,8 +800,8 @@ cdef class Tree: # While node not a leaf while node.left_child != _TREE_LEAF: # ... 
and node.right_child != _TREE_LEAF: - if X_ptr[X_sample_stride * i + - X_fx_stride * node.feature] <= node.split_value.threshold: + if goes_left(X_ptr[X_sample_stride * i + X_fx_stride * node.feature], + node.split_value, self.n_categories[node.feature]): node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -872,7 +872,8 @@ cdef class Tree: else: feature_value = 0. - if feature_value <= node.split_value.threshold: + if goes_left(feature_value, node.split_value, + self.n_categories[node.feature]): node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] From d7d13b39f0c2061aa10aa2e958ea58d107e349b6 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Fri, 17 Apr 2015 14:56:58 -0700 Subject: [PATCH 05/35] Added categorical split code to BestSplitter.node_split --- sklearn/tree/_splitter.pyx | 93 +++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 32 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index d111a0fd3a2f2..f5caf3339f121 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -33,6 +33,7 @@ from ._utils cimport rand_int from ._utils cimport rand_uniform from ._utils cimport RAND_R_MAX from ._utils cimport safe_realloc +from ._utils cimport goes_left cdef double INFINITY = np.inf @@ -333,8 +334,8 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SIZE_t f_i = n_features cdef SIZE_t f_j - cdef SIZE_t tmp cdef SIZE_t p + cdef SIZE_t q cdef SIZE_t feature_idx_offset cdef SIZE_t feature_offset cdef SIZE_t i @@ -350,6 +351,7 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SIZE_t n_total_constants = n_known_constants cdef DTYPE_t current_feature_value cdef SIZE_t partition_end + cdef bint is_categorical _init_split(&best, end) @@ -391,9 +393,8 @@ cdef class BestSplitter(BaseDenseSplitter): if f_j < n_known_constants: # f_j in the interval [n_drawn_constants, n_known_constants[ - tmp = features[f_j] - features[f_j] = features[n_drawn_constants] - features[n_drawn_constants] = tmp + features[f_j], features[n_drawn_constants] = ( + features[n_drawn_constants], features[f_j]) n_drawn_constants += 1 @@ -437,44 +438,72 @@ cdef class BestSplitter(BaseDenseSplitter): # Evaluate all splits self.criterion.reset() - p = start + is_categorical = self.n_categories[current.feature] > 0 + p = 0 if is_categorical else start + + while True: + if is_categorical: + # WARNING: This is O(n_samples * + # 2**n_categories), and will be very slow + # for more than just a few categories. 
+ if p > (1 << self.n_categories[current.feature]) - 1: + break + else: + p += 2 # LSB must always be 0 + + # Partition + q = start + partition_end = end + while q < partition_end: + if ((p >> Xf[q]) & 1): + q += 1 + else: + partition_end -= 1 + Xf[q], Xf[partition_end] = Xf[partition_end], Xf[q] + samples[q], samples[partition_end] = ( + samples[partition_end], samples[q]) + current.pos = q + else: + # Non-categorical feature + while (p + 1 < end and + Xf[p + 1] <= Xf[p] + FEATURE_THRESHOLD): + p += 1 - while p < end: - while (p + 1 < end and - Xf[p + 1] <= Xf[p] + FEATURE_THRESHOLD): + # (p + 1 >= end) or (X[samples[p + 1], current.feature] > + # X[samples[p], current.feature]) p += 1 + # (p >= end) or (X[samples[p], current.feature] > + # X[samples[p - 1], current.feature]) - # (p + 1 >= end) or (X[samples[p + 1], current.feature] > - # X[samples[p], current.feature]) - p += 1 - # (p >= end) or (X[samples[p], current.feature] > - # X[samples[p - 1], current.feature]) + if p >= end: + break - if p < end: current.pos = p - # Reject if min_samples_leaf is not guaranteed - if (((current.pos - start) < min_samples_leaf) or - ((end - current.pos) < min_samples_leaf)): - continue + # Reject if min_samples_leaf is not guaranteed + if (((current.pos - start) < min_samples_leaf) or + ((end - current.pos) < min_samples_leaf)): + continue - self.criterion.update(current.pos) + self.criterion.update(current.pos) - # Reject if min_weight_leaf is not satisfied - if ((self.criterion.weighted_n_left < min_weight_leaf) or - (self.criterion.weighted_n_right < min_weight_leaf)): - continue + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + continue - current_proxy_improvement = self.criterion.proxy_impurity_improvement() + current_proxy_improvement = self.criterion.proxy_impurity_improvement() - if current_proxy_improvement > best_proxy_improvement: - best_proxy_improvement = current_proxy_improvement + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + if is_categorical: + current.split_value.cat_split = p + else: current.split_value.threshold = (Xf[p - 1] + Xf[p]) / 2.0 - if current.split_value.threshold == Xf[p]: current.split_value.threshold = Xf[p - 1] - best = current # copy + best = current # copy # Reorganize into samples[start:best.pos] + samples[best.pos:end] if best.pos < end: @@ -483,15 +512,15 @@ cdef class BestSplitter(BaseDenseSplitter): p = start while p < partition_end: - if X[X_sample_stride * samples[p] + feature_offset] <= best.split_value.threshold: + if goes_left(X[X_sample_stride * samples[p] + feature_offset], + best.split_value, self.n_categories[best.feature]): p += 1 else: partition_end -= 1 - tmp = samples[partition_end] - samples[partition_end] = samples[p] - samples[p] = tmp + samples[p], samples[partition_end] = ( + samples[partition_end], samples[p]) self.criterion.reset() self.criterion.update(best.pos) From ec37e11bda066fc13e49c7006b05a93c23599361 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Fri, 17 Apr 2015 15:01:42 -0700 Subject: [PATCH 06/35] Added categorical split code to RandomSplitter.node_split --- sklearn/tree/_splitter.pyx | 47 +++++++++++++++++++++++--------------- sklearn/tree/_utils.pxd | 3 +++ 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index f5caf3339f121..5171a7cf76a7e 100644 --- 
a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -29,6 +29,7 @@ np.import_array() from scipy.sparse import csc_matrix from ._utils cimport log +from ._utils cimport our_rand_r from ._utils cimport rand_int from ._utils cimport rand_uniform from ._utils cimport RAND_R_MAX @@ -696,7 +697,6 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef SIZE_t f_i = n_features cdef SIZE_t f_j cdef SIZE_t p - cdef SIZE_t tmp cdef SIZE_t feature_stride # Number of features discovered to be constant during the split search cdef SIZE_t n_found_constants = 0 @@ -710,6 +710,9 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef DTYPE_t max_feature_value cdef DTYPE_t current_feature_value cdef SIZE_t partition_end + cdef bint is_categorical + cdef UINT32_t split_n_draw + cdef UINT64_t split_seed _init_split(&best, end) @@ -746,9 +749,8 @@ cdef class RandomSplitter(BaseDenseSplitter): if f_j < n_known_constants: # f_j in the interval [n_drawn_constants, n_known_constants[ - tmp = features[f_j] - features[f_j] = features[n_drawn_constants] - features[n_drawn_constants] = tmp + features[f_j], features[n_drawn_constants] = ( + features[n_drawn_constants], features[f_j]) n_drawn_constants += 1 @@ -785,19 +787,29 @@ cdef class RandomSplitter(BaseDenseSplitter): f_i -= 1 features[f_i], features[f_j] = features[f_j], features[f_i] - # Draw a random threshold - current.split_value.threshold = rand_uniform( - min_feature_value, max_feature_value, random_state) - - if current.split_value.threshold == max_feature_value: - current.split_value.threshold = min_feature_value + # Construct a random split + is_categorical = self.n_categories[current.feature] > 0 + if is_categorical: + # split_n_draw is the number of categories to send left + # TODO: this should be a binomial draw + split_n_draw = rand_int( + 1, self.n_categories[current.feature], random_state) + split_seed = our_rand_r(random_state) + current.split_value.cat_split = ( + (split_seed << 32) | (split_n_draw << 1) | 1) + else: + current.split_value.threshold = rand_uniform( + min_feature_value, max_feature_value, random_state) + if current.split_value.threshold == max_feature_value: + current.split_value.threshold = min_feature_value # Partition partition_end = end p = start while p < partition_end: current_feature_value = Xf[p] - if current_feature_value <= current.split_value.threshold: + if goes_left(current_feature_value, current.split_value, + self.n_categories[current.feature]): p += 1 else: partition_end -= 1 @@ -805,9 +817,8 @@ cdef class RandomSplitter(BaseDenseSplitter): Xf[p] = Xf[partition_end] Xf[partition_end] = current_feature_value - tmp = samples[partition_end] - samples[partition_end] = samples[p] - samples[p] = tmp + samples[p], samples[partition_end] = ( + samples[partition_end], samples[p]) current.pos = partition_end @@ -839,15 +850,15 @@ cdef class RandomSplitter(BaseDenseSplitter): p = start while p < partition_end: - if X[X_sample_stride * samples[p] + feature_stride] <= best.split_value.threshold: + if goes_left(X[X_sample_stride * samples[p] + feature_stride], + best.split_value, self.n_categories[best.feature]): p += 1 else: partition_end -= 1 - tmp = samples[partition_end] - samples[partition_end] = samples[p] - samples[p] = tmp + samples[p], samples[partition_end] = ( + samples[partition_end], samples[p]) self.criterion.reset() diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 5b73df0ebb27e..856dfaf87aaf2 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -43,6 +43,9 @@ cdef 
realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) +cdef UINT32_t our_rand_r(UINT32_t* seed) nogil + + cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil From 94521965b2eb2fecac30d647f56555c0c1e0f5b0 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 11 May 2015 14:47:40 -0700 Subject: [PATCH 07/35] Added a 'categorical' parameter to BaseDecisionTree.fit(). It currently does nothing. --- sklearn/tree/tree.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index d33f2fbadcb80..cf14ae9ff2a37 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -110,8 +110,8 @@ def __init__(self, self.tree_ = None self.max_features_ = None - def fit(self, X, y, sample_weight=None, check_input=True, - X_idx_sorted=None): + def fit(self, X, y, sample_weight=None, categorical='None', + check_input=True, X_idx_sorted=None): """Build a decision tree from the training set (X, y). Parameters ---------- @@ -133,9 +133,22 @@ def fit(self, X, y, sample_weight=None, check_input=True, classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'All'``, or ``'None'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + check_input : boolean, (default=True) Allow to bypass several input checking. - Don't use this parameter unless you know what you do. + Don't use this parameter unless you know what you are doing. X_idx_sorted : array-like, shape = [n_samples, n_features], optional The indexes of the sorted training input samples. If many tree From 361f87cc79b6e508e5a020406dc5927222895c33 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 11 May 2015 14:56:30 -0700 Subject: [PATCH 08/35] Added a property getter/setter for Tree.n_categories --- sklearn/tree/_tree.pyx | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 9d557de88a857..138833cd99894 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -529,6 +529,10 @@ cdef class Tree: value : array of double, shape [node_count, n_outputs, max_n_classes] Contains the constant prediction value of each node. + n_categories : array of int, shape [n_features] + Number of expected category values for categorical features, or + -1 for non-categorical features. + impurity : array of double, shape [node_count] impurity[i] holds the impurity (i.e., the value of the splitting criterion) at node i.
@@ -581,6 +585,15 @@ cdef class Tree: def __get__(self): return self._get_value_ndarray()[:self.node_count] + property n_categories: + def __get__(self): + return self._get_ncat_ndarray()[:self.n_features] + + def __set__(self, np.ndarray[INT32_t, ndim=1] value): + cdef SIZE_t i + for i in range(self.n_features): + self.n_categories[i] = value[i] + def __cinit__(self, int n_features, np.ndarray[SIZE_t, ndim=1] n_classes, int n_outputs): """Constructor.""" From fc7efa64f92e922368b46a8425c5cb17f21d4704 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 11 May 2015 15:31:21 -0700 Subject: [PATCH 09/35] Added n_categories arguments to Splitter.init() and TreeBuilder.build(). Enforced max n_categories=2**31 for RandomSplitter. --- sklearn/tree/_splitter.pxd | 1 + sklearn/tree/_splitter.pyx | 14 +++++++++----- sklearn/tree/_tree.pxd | 1 + sklearn/tree/_tree.pyx | 23 +++++++++++++++++++++-- 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index c75714af1c1af..809feb2ac32fe 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -108,6 +108,7 @@ cdef class Splitter: # Methods cdef void init(self, object X, np.ndarray y, DOUBLE_t* sample_weight, + INT32_t* n_categories, np.ndarray X_idx_sorted=*) except * cdef void node_reset(self, SIZE_t start, SIZE_t end, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 5171a7cf76a7e..46cb3be42898a 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -124,6 +124,7 @@ cdef class Splitter: object X, np.ndarray[DOUBLE_t, ndim=2, mode="c"] y, DOUBLE_t* sample_weight, + INT32_t* n_categories, np.ndarray X_idx_sorted=None) except *: """Initialize the splitter. @@ -189,7 +190,8 @@ cdef class Splitter: # A value of -1 indicates a non-categorical feature safe_realloc(&self.n_categories, n_features) for i in range(n_features): - self.n_categories[i] = -1 + self.n_categories[i] = (-1 if n_categories == NULL + else n_categories[i]) cdef void node_reset(self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples) nogil: @@ -271,11 +273,12 @@ cdef class BaseDenseSplitter(Splitter): object X, np.ndarray[DOUBLE_t, ndim=2, mode="c"] y, DOUBLE_t* sample_weight, + INT32_t* n_categories, np.ndarray X_idx_sorted=None) except *: """Initialize the splitter.""" # Call parent init - Splitter.init(self, X, y, sample_weight) + Splitter.init(self, X, y, sample_weight, n_categories) # Initialize X cdef np.ndarray X_ndarray = X @@ -792,8 +795,8 @@ cdef class RandomSplitter(BaseDenseSplitter): if is_categorical: # split_n_draw is the number of categories to send left # TODO: this should be a binomial draw - split_n_draw = rand_int( - 1, self.n_categories[current.feature], random_state) + split_n_draw = rand_int(1, self.n_categories[current.feature], + random_state) & 0x7FFFFFFF split_seed = our_rand_r(random_state) current.split_value.cat_split = ( (split_seed << 32) | (split_n_draw << 1) | 1) @@ -916,11 +919,12 @@ cdef class BaseSparseSplitter(Splitter): object X, np.ndarray[DOUBLE_t, ndim=2, mode="c"] y, DOUBLE_t* sample_weight, + INT32_t* n_categories, np.ndarray X_idx_sorted=None) except *: """Initialize the splitter.""" # Call parent init - Splitter.init(self, X, y, sample_weight) + Splitter.init(self, X, y, sample_weight, n_categories) if not isinstance(X, csc_matrix): raise ValueError("X should be in csc format") diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index dd22775484e7b..e877f66cf6d0b 100644 --- 
a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -104,5 +104,6 @@ cdef class TreeBuilder: cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=*, + np.ndarray n_categories=*, np.ndarray X_idx_sorted=*) cdef _check_input(self, object X, np.ndarray y, np.ndarray sample_weight) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 138833cd99894..f88618a0762b5 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -97,6 +97,7 @@ cdef class TreeBuilder: cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, + np.ndarray n_categories=None, np.ndarray X_idx_sorted=None): """Build a decision tree from the training set (X, y).""" pass @@ -146,6 +147,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, + np.ndarray n_categories=None, np.ndarray X_idx_sorted=None): """Build a decision tree from the training set (X, y).""" @@ -156,6 +158,14 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if sample_weight is not None: sample_weight_ptr = sample_weight.data + cdef INT32_t *n_categories_ptr = NULL + if n_categories is not None: + if ((n_categories.dtype != np.int32) or + (not n_categories.flags.contiguous)): + n_categories = np.asarray(n_categories, + dtype=np.int32, order="C") + n_categories_ptr = n_categories.data + # Initial capacity cdef int init_capacity @@ -174,7 +184,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t min_samples_split = self.min_samples_split # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight_ptr, X_idx_sorted) + splitter.init(X, y, sample_weight_ptr, n_categories_ptr, X_idx_sorted) cdef SIZE_t start cdef SIZE_t end @@ -303,6 +313,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, + np.ndarray n_categories=None, np.ndarray X_idx_sorted=None): """Build a decision tree from the training set (X, y).""" @@ -313,6 +324,14 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if sample_weight is not None: sample_weight_ptr = sample_weight.data + cdef INT32_t *n_categories_ptr = NULL + if n_categories is not None: + if ((n_categories.dtype != np.int32) or + (not n_categories.flags.contiguous)): + n_categories = np.asarray(n_categories, + dtype=np.int32, order="C") + n_categories_ptr = n_categories.data + # Parameters cdef Splitter splitter = self.splitter cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes @@ -321,7 +340,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef SIZE_t min_samples_split = self.min_samples_split # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight_ptr, X_idx_sorted) + splitter.init(X, y, sample_weight_ptr, n_categories_ptr, X_idx_sorted) cdef PriorityHeap frontier = PriorityHeap(INITIAL_STACK_SIZE) cdef PriorityHeapRecord record From 144d665f910fa30d7d78e1fea6db7c99e7dceedb Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 24 Jun 2015 18:49:56 -0700 Subject: [PATCH 10/35] Added python interface for categorical features. 
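From the caller's side, the interface added below marks features as categorical at fit time rather than in the estimator constructor. A usage sketch with this patch series applied (the data here is illustrative):

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    # Column 0 holds integer category codes; column 1 is ordinal.
    X = np.array([[0., 1.2], [1., 0.7], [2., 3.1], [1., 2.8]])
    y = np.array([0, 0, 1, 1])

    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y, categorical=[0])              # feature indices
    clf.fit(X, y, categorical=[True, False])    # boolean mask
    clf.fit(X, y, categorical='All')            # every feature categorical

Categorical columns are rounded to integers and remapped to a contiguous range through category_map_; at predict time, categories unseen during training are assigned fresh codes starting at n_categories.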
--- sklearn/tree/tree.py | 55 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index cf14ae9ff2a37..80a19d573f8bd 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -20,6 +20,7 @@ from abc import ABCMeta from abc import abstractmethod from math import ceil +from itertools import count import numpy as np from scipy.sparse import issparse @@ -106,6 +107,7 @@ def __init__(self, self.n_outputs_ = None self.classes_ = None self.n_classes_ = None + self.category_map_ = None self.tree_ = None self.max_features_ = None @@ -309,6 +311,42 @@ def fit(self, X, y, sample_weight=None, categorical='None', else: sample_weight = expanded_class_weight + if isinstance(categorical, str): + if categorical == "None": + categorical = np.array([]) + elif categorical == "All": + categorical = np.arange(self.n_features_) + else: + raise ValueError("Invalid value for categorical: %s. Allowed" + " strings are 'All' or 'None'" % categorical) + categorical = np.atleast_1d(categorical).flatten() + if categorical.dtype == np.bool: + if categorical.size != self.n_features_: + raise ValueError("Shape of boolean parameter categorical must" + " be [n_features]") + categorical = np.nonzero(categorical)[0] + if (categorical.size > self.n_features_ or + (categorical.size > 0 and + (np.min(categorical) < 0 or + np.max(categorical) >= self.n_features_))): + raise ValueError("Invalid shape or invalid feature index for" + " parameter categorical") + if issparse(X) and len(categorical) > 0: + raise NotImplementedError("Categorical features not supported with" + " sparse inputs") + + # Determine the number of categories in each categorical feature + n_categories = np.zeros(self.n_features_, dtype=np.int32) - 1 + self.category_map_ = [None] * self.n_features_ + if categorical.size > 0: + X = np.copy(X) + for feature in categorical: + rounded = np.round(X[:, feature]).astype(np.int64) + self.category_map_[feature] = dict(zip(set(rounded), count())) + X[:, feature] = np.array([self.category_map_[feature][x] + for x in rounded]).astype(DTYPE) + n_categories[feature] = len(self.category_map_[feature]) + # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0.
and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * @@ -364,6 +402,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', self.presort) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) + self.tree_.n_categories = n_categories # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: @@ -378,7 +417,8 @@ def fit(self, X, y, sample_weight=None, categorical='None', max_depth, max_leaf_nodes) - builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) + builder.build(self.tree_, X, y, sample_weight, n_categories, + X_idx_sorted) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] @@ -406,6 +446,19 @@ def _validate_X_predict(self, X, check_input): "input n_features is %s " % (self.n_features_, n_features)) + # Map categorical features onto integers + n_categories = self.tree_.n_categories + categorical_features = np.nonzero(n_categories > 0)[0] + if categorical_features.size > 0: + X = np.copy(X) + for feature in categorical_features: + rounded = np.round(X[:, feature]).astype('int64') + new_cat = set(rounded) - set(self.category_map_[feature]) + new_cat_map = dict(zip(new_cat, count(n_categories[feature]))) + X[:, feature] = np.array( + [self.category_map_[feature].get(x, new_cat_map.get(x)) + for x in rounded]).astype(DTYPE) + return X def predict(self, X, check_input=True): From cf0955969c533cf421746fa2ddd3603f8e4cc0bf Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 11 May 2015 23:04:45 -0700 Subject: [PATCH 11/35] Adjusted _gradient_boosting.pyx to match the new splitting mechanism. --- sklearn/ensemble/_gradient_boosting.pyx | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index 00ad2c08a0ee3..f95d120943957 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -14,6 +14,7 @@ np.import_array() from sklearn.tree._tree cimport Node from sklearn.tree._tree cimport Tree +from sklearn.tree._utils cimport goes_left ctypedef np.int32_t int32 @@ -31,6 +32,7 @@ from numpy import float64 as np_float64 DTYPE = np.float32 ctypedef np.float32_t DTYPE_t ctypedef np.npy_intp SIZE_t +ctypedef np.npy_int32 INT32_t # constant to mark tree leafs @@ -44,6 +46,7 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X, Py_ssize_t K, Py_ssize_t n_samples, Py_ssize_t n_features, + INT32_t* n_categories, float64 *out): """Predicts output for regression tree and stores it in ``out[i, k]``. @@ -78,6 +81,9 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X, ``n_samples == X.shape[0]``. n_features : int The number of features; ``n_features == X.shape[1]``. + n_categories : INT32_t pointer + Pointer to array of shape [n_features] containing the number of + categories for each feature, or -1 for non-categorical features. out : np.float64_t pointer The pointer to the data array where the predictions are stored.
``out`` is assumed to be a two-dimensional array of @@ -90,7 +96,8 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X, node = root_node # While node not a leaf while node.left_child != -1 and node.right_child != -1: - if X[i * n_features + node.feature] <= node.threshold: + if goes_left(X[i * n_features + node.feature], node.split_value, + n_categories[node.feature]): node = root_node + node.left_child else: node = root_node + node.right_child @@ -123,6 +130,7 @@ def predict_stages(np.ndarray[object, ndim=2] estimators, X.data, tree.nodes, tree.value, scale, k, K, X.shape[0], X.shape[1], + tree.n_categories, ( out).data) ## out += scale * tree.predict(X).reshape((X.shape[0], 1)) @@ -226,7 +234,8 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, if feature_index != -1: # split feature in target set # push left or right child on stack - if X[i, feature_index] <= current_node.threshold: + if goes_left(X[i, feature_index], current_node.split_value, + tree.n_categories[current_node.feature]): # left node_stack[stack_size] = (root_node + current_node.left_child) From 25fcc8835639fbf3dca65a0e3aa9a3f16732a863 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 24 Jun 2015 10:50:38 -0700 Subject: [PATCH 12/35] Added interface code to forest.py and gradient_boosting.py for categorical features. --- sklearn/ensemble/forest.py | 27 +++++++++++---- sklearn/ensemble/gradient_boosting.py | 48 +++++++++++++++------------ 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index e26323f65bfee..bc88343261612 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -90,8 +90,8 @@ def _generate_unsampled_indices(random_state, n_samples): return unsampled_indices -def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, - verbose=0, class_weight=None): +def _parallel_build_trees(tree, forest, X, y, sample_weight, categorical, + tree_idx, n_trees, verbose=0, class_weight=None): """Private function used to fit a single tree in parallel.""" if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) @@ -114,9 +114,11 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, elif class_weight == 'balanced_subsample': curr_sample_weight *= compute_sample_weight('balanced', y, indices) - tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) + tree.fit(X, y, sample_weight=curr_sample_weight, + categorical=categorical, check_input=False) else: - tree.fit(X, y, sample_weight=sample_weight, check_input=False) + tree.fit(X, y, sample_weight=sample_weight, + categorical=categorical, check_input=False) return tree @@ -211,7 +213,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, categorical='None'): """Build a forest of trees from the training set (X, y). Parameters @@ -232,6 +234,19 @@ def fit(self, X, y, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'All'``, or ``'None'``. Indicates which + features should be considered as categorical rather than + ordinal. 
For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + Returns ------- self : object @@ -315,7 +330,7 @@ def fit(self, X, y, sample_weight=None): trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")( delayed(_parallel_build_trees)( - t, self, X, y, sample_weight, i, len(trees), + t, self, X, y, sample_weight, categorical, i, len(trees), verbose=self.verbose, class_weight=self.class_weight) for i, t in enumerate(trees)) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index b17d726cb122a..087a8430f0b15 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -746,7 +746,8 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, self.estimators_ = np.empty((0, 0), dtype=np.object) def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, - random_state, X_idx_sorted, X_csc=None, X_csr=None): + categorical, random_state, X_idx_sorted, X_csc=None, + X_csr=None): """Fit another stage of ``n_classes_`` trees to the boosting model. """ assert sample_mask.dtype == np.bool @@ -777,22 +778,14 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, # no inplace multiplication! sample_weight = sample_weight * sample_mask.astype(np.float64) - if X_csc is not None: - tree.fit(X_csc, residual, sample_weight=sample_weight, - check_input=False, X_idx_sorted=X_idx_sorted) - else: - tree.fit(X, residual, sample_weight=sample_weight, - check_input=False, X_idx_sorted=X_idx_sorted) + tree.fit(X_csc if X_csc is not None else X, residual, + sample_weight=sample_weight, categorical=categorical, + check_input=False, X_idx_sorted=X_idx_sorted) # update tree leaves - if X_csr is not None: - loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred, - sample_weight, sample_mask, - self.learning_rate, k=k) - else: - loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred, - sample_weight, sample_mask, - self.learning_rate, k=k) + loss.update_terminal_regions( + tree.tree_, X_csr if X_csr is not None else X, y, residual, + y_pred, sample_weight, sample_mask, self.learning_rate, k=k) # add tree to ensemble self.estimators_[i, k] = tree @@ -928,7 +921,7 @@ def _check_initialized(self): raise NotFittedError("Estimator not fitted, call `fit`" " before making predictions`.") - def fit(self, X, y, sample_weight=None, monitor=None): + def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): """Fit the gradient boosting model. Parameters @@ -949,6 +942,15 @@ def fit(self, X, y, sample_weight=None, monitor=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'All'``, or ``'None'``. Indicates which + features should be considered as categorical rather than + ordinal. The maximum number of categories per feature is + 64, though the real-world limit will be much lower because + evaluating splits has :math:`O(2^N)` time complexity, for + :math:`N` categories. 
+ monitor : callable, optional The monitor is called after each iteration with the current iteration, a reference to the estimator and the local variables of @@ -1022,8 +1024,9 @@ def fit(self, X, y, sample_weight=None, monitor=None): dtype=np.int32) # fit the boosting stages - n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state, - begin_at_stage, monitor, X_idx_sorted) + n_stages = self._fit_stages(X, y, y_pred, sample_weight, categorical, + random_state, begin_at_stage, monitor, + X_idx_sorted) # change shape of arrays after fit (early-stopping or additional ests) if n_stages != self.estimators_.shape[0]: self.estimators_ = self.estimators_[:n_stages] @@ -1033,8 +1036,9 @@ def fit(self, X, y, sample_weight=None, monitor=None): return self - def _fit_stages(self, X, y, y_pred, sample_weight, random_state, - begin_at_stage=0, monitor=None, X_idx_sorted=None): + def _fit_stages(self, X, y, y_pred, sample_weight, categorical, + random_state, begin_at_stage=0, monitor=None, + X_idx_sorted=None): """Iteratively fits the stages. For each stage it computes the progress (OOB, train score) @@ -1077,8 +1081,8 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state, # fit next stage of trees y_pred = self._fit_stage(i, X, y, y_pred, sample_weight, - sample_mask, random_state, X_idx_sorted, - X_csc, X_csr) + sample_mask, categorical, random_state, + X_idx_sorted, X_csc, X_csr) # track deviance (= loss) if do_oob: From 30fe8296c69f813a696b4625bb0c1b9eb241cbff Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 13 May 2015 23:37:00 -0700 Subject: [PATCH 13/35] Added bit caches to Splitter and Tree. These are used to avoid regenerating a big list of random numbers for every sample in every node when sending it left or right. 
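The cache itself is a packed bitfield with one bit per category, sized (n_categories + 7) // 8 bytes as allocated below. A rough Python sketch of the idea, assuming the drawn category set is already known; make_bit_cache() lives in _utils.pyx, its exact implementation is not shown in this excerpt, and the LSB-first bit order within each byte is an assumption here:

    def make_bit_cache(categories_going_left, n_categories):
        # One bit per category, packed into (n_categories + 7) // 8 bytes
        cache = bytearray((n_categories + 7) // 8)
        for cat in categories_going_left:
            cache[cat // 8] |= 1 << (cat % 8)
        return cache

    def cached_goes_left(feature_value, cache):
        # A single bit test replaces re-drawing the random category set
        cat = int(feature_value)
        return (cache[cat // 8] >> (cat % 8)) & 1 == 1

    cache = make_bit_cache({1, 4, 9}, 12)  # categories 1, 4 and 9 go left
    assert cached_goes_left(4.0, cache)
    assert not cached_goes_left(5.0, cache)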
--- sklearn/ensemble/_gradient_boosting.pyx | 13 +++++-- sklearn/tree/_splitter.pxd | 2 ++ sklearn/tree/_splitter.pyx | 25 ++++++++++--- sklearn/tree/_tree.pxd | 4 +++ sklearn/tree/_tree.pyx | 39 ++++++++++++++++++-- sklearn/tree/_utils.pxd | 11 ++++-- sklearn/tree/_utils.pyx | 47 ++++++++++++++++++------- 7 files changed, 118 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index f95d120943957..a26762bad21d6 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -97,7 +97,7 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X, # While node not a leaf while node.left_child != -1 and node.right_child != -1: if goes_left(X[i * n_features + node.feature], node.split_value, - n_categories[node.feature]): + n_categories[node.feature], node._bit_cache): node = root_node + node.left_child else: node = root_node + node.right_child @@ -123,6 +123,8 @@ def predict_stages(np.ndarray[object, ndim=2] estimators, for k in range(K): tree = estimators[i, k].tree_ + tree.populate_bit_caches() + # avoid buffer validation by casting to ndarray # and get data pointer # need brackets because of casting operator priority @@ -134,6 +136,8 @@ def predict_stages(np.ndarray[object, ndim=2] estimators, ( out).data) ## out += scale * tree.predict(X).reshape((X.shape[0], 1)) + tree.delete_bit_caches() + @cython.nonecheck(False) def predict_stage(np.ndarray[object, ndim=2] estimators, @@ -212,6 +216,8 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, underlying_stack = np_zeros((stack_capacity,), dtype=np.intp) node_stack = ( underlying_stack).data + tree.populate_bit_caches() + for i in range(X.shape[0]): # init stacks for new example stack_size = 1 @@ -235,7 +241,8 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, # split feature in target set # push left or right child on stack if goes_left(X[i, feature_index], current_node.split_value, - tree.n_categories[current_node.feature]): + tree.n_categories[current_node.feature], + current_node._bit_cache): # left node_stack[stack_size] = (root_node + current_node.left_child) @@ -273,6 +280,8 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, raise ValueError("Total weight should be 1.0 but was %.9f" % total_weight) + tree.delete_bit_caches() + def _random_sample_mask(np.npy_intp n_total_samples, np.npy_intp n_total_in_bag, random_state): diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 809feb2ac32fe..64bfb22bd9284 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -17,6 +17,7 @@ from ._criterion cimport Criterion ctypedef np.npy_float32 DTYPE_t # Type of X ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters +ctypedef np.npy_uint8 UINT8_t # Unsigned 8 bit integer ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer @@ -88,6 +89,7 @@ cdef class Splitter: cdef DOUBLE_t* sample_weight cdef INT32_t* n_categories # (n_features) array giving number of # categories (<0 for non-categorical) + cdef UINT8_t* _bit_cache # The samples vector `samples` is maintained by the Splitter object such # that the samples contained in a node are contiguous. 
With this setting, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 46cb3be42898a..49fc1af5ed3c6 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -34,6 +34,7 @@ from ._utils cimport rand_int from ._utils cimport rand_uniform from ._utils cimport RAND_R_MAX from ._utils cimport safe_realloc +from ._utils cimport make_bit_cache from ._utils cimport goes_left cdef double INFINITY = np.inf @@ -98,6 +99,7 @@ cdef class Splitter: self.y_stride = 0 self.sample_weight = NULL self.n_categories = NULL + self._bit_cache = NULL self.max_features = max_features self.min_samples_leaf = min_samples_leaf @@ -113,6 +115,7 @@ cdef class Splitter: free(self.constant_features) free(self.feature_values) free(self.n_categories) + free(self._bit_cache) def __getstate__(self): return {} @@ -193,6 +196,12 @@ cdef class Splitter: self.n_categories[i] = (-1 if n_categories == NULL else n_categories[i]) + # If needed, allocate cache space to hold split info + cdef INT32_t max_n_categories = max( + [self.n_categories[i] for i in range(n_features)]) + if max_n_categories > 0: + safe_realloc(&self._bit_cache, (max_n_categories + 7) // 8) + cdef void node_reset(self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples) nogil: """Reset splitter on node samples[start:end]. @@ -459,7 +468,7 @@ cdef class BestSplitter(BaseDenseSplitter): q = start partition_end = end while q < partition_end: - if ((p >> Xf[q]) & 1): + if (p >> Xf[q]) & 1: q += 1 else: partition_end -= 1 @@ -511,13 +520,16 @@ cdef class BestSplitter(BaseDenseSplitter): # Reorganize into samples[start:best.pos] + samples[best.pos:end] if best.pos < end: + make_bit_cache(best.split_value, self.n_categories[best.feature], + self._bit_cache) feature_offset = X_feature_stride * best.feature partition_end = end p = start while p < partition_end: if goes_left(X[X_sample_stride * samples[p] + feature_offset], - best.split_value, self.n_categories[best.feature]): + best.split_value, self.n_categories[best.feature], + self._bit_cache): p += 1 else: @@ -807,12 +819,14 @@ cdef class RandomSplitter(BaseDenseSplitter): current.split_value.threshold = min_feature_value # Partition + make_bit_cache(current.split_value, self.n_categories[current.feature], + self._bit_cache) partition_end = end p = start while p < partition_end: current_feature_value = Xf[p] if goes_left(current_feature_value, current.split_value, - self.n_categories[current.feature]): + self.n_categories[current.feature], self._bit_cache): p += 1 else: partition_end -= 1 @@ -848,13 +862,16 @@ cdef class RandomSplitter(BaseDenseSplitter): # Reorganize into samples[start:best.pos] + samples[best.pos:end] feature_stride = X_feature_stride * best.feature if best.pos < end: + make_bit_cache(best.split_value, self.n_categories[best.feature], + self._bit_cache) if current.feature != best.feature: partition_end = end p = start while p < partition_end: if goes_left(X[X_sample_stride * samples[p] + feature_stride], - best.split_value, self.n_categories[best.feature]): + best.split_value, self.n_categories[best.feature], + self._bit_cache): p += 1 else: diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index e877f66cf6d0b..e330848fb85a5 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -15,6 +15,7 @@ cimport numpy as np ctypedef np.npy_float32 DTYPE_t # Type of X ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters +ctypedef np.npy_uint8 UINT8_t # Unsigned 8 bit 
integer ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer @@ -34,6 +35,7 @@ cdef struct Node: DOUBLE_t impurity # Impurity of the node (i.e., the value of the criterion) SIZE_t n_node_samples # Number of samples at the node DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node + UINT8_t* _bit_cache cdef class Tree: @@ -69,6 +71,8 @@ cdef class Tree: cdef np.ndarray _get_value_ndarray(self) cdef np.ndarray _get_node_ndarray(self) cdef np.ndarray _get_ncat_ndarray(self) + cdef void populate_bit_caches(self) + cdef void delete_bit_caches(self) cpdef np.ndarray predict(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f88618a0762b5..8bbb24ffb2cd6 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -35,6 +35,7 @@ from ._utils cimport StackRecord from ._utils cimport PriorityHeap from ._utils cimport PriorityHeapRecord from ._utils cimport safe_realloc +from ._utils cimport make_bit_cache from ._utils cimport goes_left from ._utils cimport sizet_ptr_to_ndarray @@ -643,6 +644,7 @@ cdef class Tree: def __dealloc__(self): """Destructor.""" # Free all inner structures + self.delete_bit_caches() free(self.n_classes) free(self.value) free(self.nodes) @@ -781,10 +783,39 @@ cdef class Tree: node.feature = feature node.split_value = split_value + node._bit_cache = NULL + self.node_count += 1 return node_id + cdef void populate_bit_caches(self): + """Allocates and populates bit caches for nodes that split on + categorical features. Should be run before every tree traversal.""" + cdef Node* node = self.nodes + cdef Node* end_node = self.nodes + self.node_count + cdef INT32_t n_categories = 0 + + while node != end_node: + if node.left_child != _TREE_LEAF: + n_categories = self.n_categories[node.feature] + if n_categories > 0: + safe_realloc(&node._bit_cache, (n_categories + 7) // 8) + make_bit_cache(node.split_value, n_categories, + node._bit_cache) + node += 1 + + cdef void delete_bit_caches(self): + """Deallocates the bit cache of each node in the tree. Should be run + after tree traversal.""" + cdef Node* node = self.nodes + cdef Node* end_node = self.nodes + self.node_count + + while node != end_node: + free(node._bit_cache) + node._bit_cache = NULL + node += 1 + cpdef np.ndarray predict(self, object X): """Predict target for X.""" out = self._get_value_ndarray().take(self.apply(X), axis=0, @@ -826,6 +857,8 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + self.populate_bit_caches() + with nogil: for i in range(n_samples): node = self.nodes @@ -833,13 +866,15 @@ cdef class Tree: while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: if goes_left(X_ptr[X_sample_stride * i + X_fx_stride * node.feature], - node.split_value, self.n_categories[node.feature]): + node.split_value, self.n_categories[node.feature], node._bit_cache): node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] out_ptr[i] = (node - self.nodes) # node offset + self.delete_bit_caches() + return out cdef inline np.ndarray _apply_sparse_csr(self, object X): @@ -905,7 +940,7 @@ cdef class Tree: feature_value = 0. 
if goes_left(feature_value, node.split_value, - self.n_categories[node.feature]): + self.n_categories[node.feature], node._bit_cache): node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 856dfaf87aaf2..aea80233cb7c3 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -15,6 +15,7 @@ from ._splitter cimport SplitValue ctypedef np.npy_float32 DTYPE_t # Type of X ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters +ctypedef np.npy_uint8 UINT8_t # Unsigned 8 bit integer ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer @@ -56,9 +57,13 @@ cdef double rand_uniform(double low, double high, cdef double log(double x) nogil -# Function for traversing a tree -cdef bint goes_left(DTYPE_t feature_value, SplitValue split, - INT32_t n_categories) nogil + +# Functions for traversing a tree +cdef inline void make_bit_cache(SplitValue split, INT32_t n_categories, + UINT8_t* bit_cache) nogil + +cdef inline bint goes_left(DTYPE_t feature_value, SplitValue split, + INT32_t n_categories, UINT8_t* bit_cache) nogil # ============================================================================= # Stack data structure diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index a2c975506a751..10d48db6c922e 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -82,25 +82,48 @@ cdef inline double rand_uniform(double low, double high, cdef inline double log(double x) nogil: return ln(x) / ln(2.0) -cdef bint goes_left(DTYPE_t feature_value, SplitValue split, - INT32_t n_categories) nogil: - """Determine whether a sample goes to the left or right child node.""" +cdef inline void make_bit_cache(SplitValue split, INT32_t n_categories, + UINT8_t* bit_cache) nogil: + """Regenerate and store the random numbers for a split.""" cdef UINT32_t rng_seed + cdef SIZE_t q + cdef UINT32_t val, idx, shift - if n_categories < 1: - # Non-categorical feature - return feature_value <= split.threshold - elif (split.cat_split & 1 == 0): + if (n_categories <= 0): + # Non-categorical feature; bit cache not used + return + + if (split.cat_split & 1 == 0): # Bitfield model - return (split.cat_split >> feature_value) & 1 + for q in range((n_categories + 7) // 8): + bit_cache[q] = (split.cat_split >> (q * 8)) & 0xFF else: # Random model + for q in range((n_categories + 7) // 8): + bit_cache[q] = 0 rng_seed = split.cat_split >> 32 for q in range((split.cat_split & 0xFFFFFFFF) >> 1): - if (feature_value == - rand_int(0, n_categories, &rng_seed)): - return 1 - return 0 + val = rand_int(0, n_categories, &rng_seed) + idx = val // 8 + shift = val % 8 + bit_cache[idx] |= (1 << shift) + +cdef inline bint goes_left(DTYPE_t feature_value, SplitValue split, + INT32_t n_categories, UINT8_t* bit_cache) nogil: + """Determine whether a sample goes to the left or right child node.""" + cdef SIZE_t idx, shift + + if n_categories < 1: + # Non-categorical feature + return feature_value <= split.threshold + else: + # Categorical feature, using bit cache + if ( feature_value) < n_categories: + idx = ( feature_value) // 8 + shift = ( feature_value) % 8 + return (bit_cache[idx] >> shift) & 1 + else: + return 0 # ============================================================================= From 1c1b7764a0271d23cf71a1f87ccf99adf25ee93a Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 13 May 2015 
23:43:30 -0700 Subject: [PATCH 14/35] Added a check on the predict() and predict_proba() methods of trees that there are no sparse categorical variables, since they're currently not supported. --- sklearn/tree/tree.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 80a19d573f8bd..1335399af3505 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -450,6 +450,9 @@ def _validate_X_predict(self, X, check_input): n_categories = self.tree_.n_categories categorical_features = np.nonzero(n_categories > 0)[0] if categorical_features.size > 0: + if issparse(X): + raise NotImplementedError("Categorical features not supported" + " with sparse inputs") X = np.copy(X) for feature in categorical_features: rounded = np.round(X[:, feature]).astype('int64') From b68be8cf376dc39c400acee9a43e66760d0eae0a Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Fri, 5 Jun 2015 19:09:07 -0700 Subject: [PATCH 15/35] Added property getter for a Tree's split_values in addition to thresholds. --- sklearn/tree/_tree.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8bbb24ffb2cd6..93a6d8cc89010 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -589,6 +589,10 @@ cdef class Tree: def __get__(self): return self._get_node_ndarray()['split_value']['threshold'][:self.node_count] + property split_value: + def __get__(self): + return self._get_node_ndarray()['split_value'][:self.node_count] + property impurity: def __get__(self): return self._get_node_ndarray()['impurity'][:self.node_count] From e187c1074dc880ff4e59e1280ee5c96b8168ae18 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 17 Jun 2015 18:55:58 -0700 Subject: [PATCH 16/35] Added a check to prevent use of DecisionTree (instead of ExtraTree) with more than 64 categories. --- sklearn/tree/tree.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 1335399af3505..e6575c7332b2d 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -344,7 +344,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', rounded = np.round(X[:, feature]).astype(np.int64) self.category_map_[feature] = dict(izip(set(rounded), count())) X[:, feature] = np.array([self.category_map_[feature][x] - for x in rounded]).astype(DTYPE) + for x in rounded]).astype(DTYPE) n_categories[feature] = len(self.category_map_[feature]) # Set min_weight_leaf from min_weight_fraction_leaf @@ -401,6 +401,12 @@ def fit(self, X, y, sample_weight=None, categorical='None', random_state, self.presort) + if (not isinstance(self.splitter, _splitter.RandomSplitter) and + np.max(n_categories) > 64): + raise ValueError('A feature with {} categories was detected; to' + ' use more than 64, use ExtraTree rather than' + ' DecisionTree.'.format(np.max(n_categories))) + self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) self.tree_.n_categories = n_categories From 909f09b299c28af0817b961f381514faab7f352b Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Tue, 23 Jun 2015 12:12:35 -0700 Subject: [PATCH 17/35] Fixed the numpy NODE dtype so it matches the cython struct. 
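The fix below is about keeping two descriptions of one memory layout in sync: the Cython Node struct and the numpy structured dtype through which the tree exposes the same bytes. The sketch that follows is a hypothetical ctypes analogue, for illustration only; the real struct also carries impurity and sample-count fields, and the real dtype takes its offsets from the Cython struct itself. It also shows the union trick used by SPLITVALUE_DTYPE: two fields at offset 0 viewing the same 8 bytes.

    import ctypes
    import numpy as np

    class CNode(ctypes.Structure):            # hypothetical stand-in layout
        _fields_ = [("left_child", ctypes.c_ssize_t),
                    ("right_child", ctypes.c_ssize_t),
                    ("feature", ctypes.c_ssize_t),
                    ("split_value", ctypes.c_uint64),  # 8-byte union slot
                    ("_bit_cache", ctypes.c_void_p)]   # pointer, stored as intp

    split_value_dtype = np.dtype({
        "names": ["threshold", "cat_split"],
        "formats": [np.float64, np.uint64],
        "offsets": [0, 0],                    # both at offset 0 = a C union
        "itemsize": 8,
    })

    node_dtype = np.dtype({
        "names": [name for name, _ in CNode._fields_],
        "formats": [np.intp, np.intp, np.intp, split_value_dtype, np.intp],
        "offsets": [getattr(CNode, name).offset for name, _ in CNode._fields_],
        "itemsize": ctypes.sizeof(CNode),
    })
    # If any field is missing or misplaced, reads through the dtype are garbage.
    assert node_dtype.itemsize == ctypes.sizeof(CNode)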
--- sklearn/tree/_tree.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 93a6d8cc89010..33756f0932c5b 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -75,9 +75,10 @@ SPLITVALUE_DTYPE = np.dtype({ }) NODE_DTYPE = np.dtype({ 'names': ['left_child', 'right_child', 'feature', 'split_value', - 'impurity', 'n_node_samples', 'weighted_n_node_samples'], + 'impurity', 'n_node_samples', 'weighted_n_node_samples', + '_bit_cache'], 'formats': [np.intp, np.intp, np.intp, SPLITVALUE_DTYPE, np.float64, - np.intp, np.float64], + np.intp, np.float64, np.intp], 'offsets': [ &( NULL).left_child, &( NULL).right_child, @@ -85,7 +86,8 @@ NODE_DTYPE = np.dtype({ &( NULL).split_value, &( NULL).impurity, &( NULL).n_node_samples, - &( NULL).weighted_n_node_samples + &( NULL).weighted_n_node_samples, + &( NULL)._bit_cache ] }) From ede1e69ccaec78af821538287966f1ce029a1d9e Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Tue, 23 Jun 2015 12:13:38 -0700 Subject: [PATCH 18/35] Put the gradient boosting tree descent code into a try..finally block to make sure we free the bit cache. --- sklearn/ensemble/_gradient_boosting.pyx | 126 ++++++++++++------------ 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index a26762bad21d6..49524cd697751 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -218,69 +218,71 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, tree.populate_bit_caches() - for i in range(X.shape[0]): - # init stacks for new example - stack_size = 1 - node_stack[0] = root_node - weight_stack[0] = 1.0 - total_weight = 0.0 - - while stack_size > 0: - # get top node on stack - stack_size -= 1 - current_node = node_stack[stack_size] - - if current_node.left_child == LEAF: - out[i] += weight_stack[stack_size] * value[current_node - root_node] * \ - learn_rate - total_weight += weight_stack[stack_size] - else: - # non-terminal node - feature_index = array_index(current_node.feature, target_feature) - if feature_index != -1: - # split feature in target set - # push left or right child on stack - if goes_left(X[i, feature_index], current_node.split_value, - tree.n_categories[current_node.feature], - current_node._bit_cache): - # left - node_stack[stack_size] = (root_node + - current_node.left_child) - else: - # right - node_stack[stack_size] = (root_node + - current_node.right_child) - stack_size += 1 + try: + for i in range(X.shape[0]): + # init stacks for new example + stack_size = 1 + node_stack[0] = root_node + weight_stack[0] = 1.0 + total_weight = 0.0 + + while stack_size > 0: + # get top node on stack + stack_size -= 1 + current_node = node_stack[stack_size] + + if current_node.left_child == LEAF: + out[i] += weight_stack[stack_size] * value[current_node - root_node] * \ + learn_rate + total_weight += weight_stack[stack_size] else: - # split feature in complement set - # push both children onto stack - - # push left child - node_stack[stack_size] = root_node + current_node.left_child - current_weight = weight_stack[stack_size] - left_sample_frac = root_node[current_node.left_child].n_node_samples / \ - current_node.n_node_samples - if left_sample_frac <= 0.0 or left_sample_frac >= 1.0: - raise ValueError("left_sample_frac:%f, " - "n_samples current: %d, " - "n_samples left: %d" - % (left_sample_frac, - current_node.n_node_samples, - 
root_node[current_node.left_child].n_node_samples)) - weight_stack[stack_size] = current_weight * left_sample_frac - stack_size +=1 - - # push right child - node_stack[stack_size] = root_node + current_node.right_child - weight_stack[stack_size] = current_weight * \ - (1.0 - left_sample_frac) - stack_size +=1 - - if not (0.999 < total_weight < 1.001): - raise ValueError("Total weight should be 1.0 but was %.9f" % - total_weight) - - tree.delete_bit_caches() + # non-terminal node + feature_index = array_index(current_node.feature, target_feature) + if feature_index != -1: + # split feature in target set + # push left or right child on stack + if goes_left(X[i, feature_index], current_node.split_value, + tree.n_categories[current_node.feature], + current_node._bit_cache): + # left + node_stack[stack_size] = (root_node + + current_node.left_child) + else: + # right + node_stack[stack_size] = (root_node + + current_node.right_child) + stack_size += 1 + else: + # split feature in complement set + # push both children onto stack + + # push left child + node_stack[stack_size] = root_node + current_node.left_child + current_weight = weight_stack[stack_size] + left_sample_frac = root_node[current_node.left_child].n_node_samples / \ + current_node.n_node_samples + if left_sample_frac <= 0.0 or left_sample_frac >= 1.0: + raise ValueError("left_sample_frac:%f, " + "n_samples current: %d, " + "n_samples left: %d" + % (left_sample_frac, + current_node.n_node_samples, + root_node[current_node.left_child].n_node_samples)) + weight_stack[stack_size] = current_weight * left_sample_frac + stack_size +=1 + + # push right child + node_stack[stack_size] = root_node + current_node.right_child + weight_stack[stack_size] = current_weight * \ + (1.0 - left_sample_frac) + stack_size +=1 + + if not (0.999 < total_weight < 1.001): + raise ValueError("Total weight should be 1.0 but was %.9f" % + total_weight) + + finally: + tree.delete_bit_caches() def _random_sample_mask(np.npy_intp n_total_samples, From abd27dfa62b6564c46280322d0f321ff1f5e5571 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Thu, 25 Jun 2015 13:21:26 -0700 Subject: [PATCH 19/35] Fixed a python3 compatibility problem. 
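The incompatibility being fixed is small but fatal on import: itertools.izip exists only on Python 2, while on Python 3 the builtin zip is already lazy. The patch routes through six.moves.zip, which resolves to whichever is available; a dependency-free sketch of the same dispatch:

    from itertools import count
    try:
        from itertools import izip as zip_   # Python 2: lazy zip lives here
    except ImportError:
        zip_ = zip                           # Python 3: builtin zip is lazy

    # The pattern the tree code uses this for: raw values -> contiguous codes.
    category_map = dict(zip_(sorted({42, 3, 7}), count()))
    assert category_map == {3: 0, 7: 1, 42: 2}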
--- sklearn/tree/tree.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index e6575c7332b2d..61bba4d6c8cf0 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -20,7 +20,7 @@ from abc import ABCMeta from abc import abstractmethod from math import ceil -from itertools import izip, count +from itertools import count import numpy as np from scipy.sparse import issparse @@ -29,6 +29,7 @@ from ..base import ClassifierMixin from ..base import RegressorMixin from ..externals import six +from ..externals.six.moves import zip from ..feature_selection.from_model import _LearntSelectorMixin from ..utils import check_array from ..utils import check_random_state @@ -342,7 +343,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', X = np.copy(X) for feature in categorical: rounded = np.round(X[:, feature]).astype(np.int64) - self.category_map_[feature] = dict(izip(set(rounded), count())) + self.category_map_[feature] = dict(zip(set(rounded), count())) X[:, feature] = np.array([self.category_map_[feature][x] for x in rounded]).astype(DTYPE) n_categories[feature] = len(self.category_map_[feature]) @@ -463,7 +464,7 @@ def _validate_X_predict(self, X, check_input): for feature in categorical_features: rounded = np.round(X[:, feature]).astype('int64') new_cat = set(rounded) - set(self.category_map_[feature]) - new_cat_map = dict(izip(new_cat, count(n_categories[feature]))) + new_cat_map = dict(zip(new_cat, count(n_categories[feature]))) X[:, feature] = np.array( [self.category_map_[feature].get(x, new_cat_map.get(x)) for x in rounded]).astype(DTYPE) From ee398817e4e0215403e23ed4c1842dd2a87fc43a Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Tue, 13 Oct 2015 17:36:49 -0700 Subject: [PATCH 20/35] Fixed a Splitter validation bug in tree.py --- sklearn/tree/tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 61bba4d6c8cf0..02133db98f96b 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -394,7 +394,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter - if not isinstance(self.splitter, Splitter): + if not isinstance(splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, self.max_features_, min_samples_leaf, @@ -402,7 +402,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', random_state, self.presort) - if (not isinstance(self.splitter, _splitter.RandomSplitter) and + if (not isinstance(splitter, _splitter.RandomSplitter) and np.max(n_categories) > 64): raise ValueError('A feature with {} categories was detected; to' ' use more than 64, use ExtraTree rather than' From b81b29cc13c865ff26eca34bec42515840cae13d Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Tue, 13 Oct 2015 17:42:37 -0700 Subject: [PATCH 21/35] RandomSplitter now flips a coin to send each category value left or right --- sklearn/tree/_splitter.pxd | 7 +++---- sklearn/tree/_splitter.pyx | 8 +------- sklearn/tree/_utils.pyx | 10 ++++------ 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 64bfb22bd9284..0565a87dde706 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -35,10 +35,9 @@ ctypedef union SplitValue: # for up to 64 categories, sending samples left if the bit # corresponding to their category is 1 or right if it is 0. 
If the # LSB is 1, then the more significant 32 bits of cat_split is a - # random seed and the next 31 bits are the number of deviates to - # draw. To evaluate a sample, draw the required set of categories - # and check if the sample's feature value is in the set. If so, - # send it left; otherwise right. This second method allows up to + # random seed. To evaluate a sample, use the random seed to flip a + # coin (category_value + 1) times and send it left if the last + # flip gives 1; otherwise right. This second method allows up to # 2**31 category values, but can only be used for RandomSplitter. DOUBLE_t threshold UINT64_t cat_split diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 49fc1af5ed3c6..f4b406f0dd391 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -726,7 +726,6 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef DTYPE_t current_feature_value cdef SIZE_t partition_end cdef bint is_categorical - cdef UINT32_t split_n_draw cdef UINT64_t split_seed _init_split(&best, end) @@ -805,13 +804,8 @@ cdef class RandomSplitter(BaseDenseSplitter): # Construct a random split is_categorical = self.n_categories[current.feature] > 0 if is_categorical: - # split_n_draw is the number of categories to send left - # TODO: this should be a binomial draw - split_n_draw = rand_int(1, self.n_categories[current.feature], - random_state) & 0x7FFFFFFF split_seed = our_rand_r(random_state) - current.split_value.cat_split = ( - (split_seed << 32) | (split_n_draw << 1) | 1) + current.split_value.cat_split = (split_seed << 32) | 1 else: current.split_value.threshold = rand_uniform( min_feature_value, max_feature_value, random_state) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 10d48db6c922e..4f4e2488bc12e 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -87,7 +87,7 @@ cdef inline void make_bit_cache(SplitValue split, INT32_t n_categories, """Regenerate and store the random numbers for a split.""" cdef UINT32_t rng_seed cdef SIZE_t q - cdef UINT32_t val, idx, shift + cdef UINT32_t val if (n_categories <= 0): # Non-categorical feature; bit cache not used @@ -102,11 +102,9 @@ cdef inline void make_bit_cache(SplitValue split, INT32_t n_categories, for q in range((n_categories + 7) // 8): bit_cache[q] = 0 rng_seed = split.cat_split >> 32 - for q in range((split.cat_split & 0xFFFFFFFF) >> 1): - val = rand_int(0, n_categories, &rng_seed) - idx = val // 8 - shift = val % 8 - bit_cache[idx] |= (1 << shift) + for q in range(n_categories): + val = rand_int(0, 2, &rng_seed) + bit_cache[q // 8] |= val << (q % 8) cdef inline bint goes_left(DTYPE_t feature_value, SplitValue split, INT32_t n_categories, UINT8_t* bit_cache) nogil: From 548e4e5ede16173cb53fde28b621eb0478c4fb9c Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Tue, 13 Oct 2015 17:49:06 -0700 Subject: [PATCH 22/35] For categorical features, RandomSplitter now retries up to 20 random splits until a non-trivial one is found --- sklearn/tree/_splitter.pyx | 60 +++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index f4b406f0dd391..e259f524bc704 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -712,6 +712,7 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef SIZE_t f_i = n_features cdef SIZE_t f_j cdef SIZE_t p + cdef SIZE_t q cdef SIZE_t feature_stride # Number of features discovered to be constant during the split search cdef 
SIZE_t n_found_constants = 0 @@ -802,36 +803,43 @@ cdef class RandomSplitter(BaseDenseSplitter): features[f_i], features[f_j] = features[f_j], features[f_i] # Construct a random split - is_categorical = self.n_categories[current.feature] > 0 - if is_categorical: - split_seed = our_rand_r(random_state) - current.split_value.cat_split = (split_seed << 32) | 1 - else: - current.split_value.threshold = rand_uniform( - min_feature_value, max_feature_value, random_state) - if current.split_value.threshold == max_feature_value: - current.split_value.threshold = min_feature_value - - # Partition - make_bit_cache(current.split_value, self.n_categories[current.feature], - self._bit_cache) - partition_end = end - p = start - while p < partition_end: - current_feature_value = Xf[p] - if goes_left(current_feature_value, current.split_value, - self.n_categories[current.feature], self._bit_cache): - p += 1 + # Repeat up to 20 times if a trivial split is constructed + # (this can only happen with a categorical feature) + for q in range(20): + is_categorical = self.n_categories[current.feature] > 0 + if is_categorical: + split_seed = our_rand_r(random_state) + current.split_value.cat_split = (split_seed << 32) | 1 else: - partition_end -= 1 + current.split_value.threshold = rand_uniform( + min_feature_value, max_feature_value, random_state) + if current.split_value.threshold == max_feature_value: + current.split_value.threshold = min_feature_value + + # Partition + make_bit_cache(current.split_value, self.n_categories[current.feature], + self._bit_cache) + partition_end = end + p = start + while p < partition_end: + current_feature_value = Xf[p] + if goes_left(current_feature_value, current.split_value, + self.n_categories[current.feature], self._bit_cache): + p += 1 + else: + partition_end -= 1 + + Xf[p] = Xf[partition_end] + Xf[partition_end] = current_feature_value - Xf[p] = Xf[partition_end] - Xf[partition_end] = current_feature_value + samples[p], samples[partition_end] = ( + samples[partition_end], samples[p]) - samples[p], samples[partition_end] = ( - samples[partition_end], samples[p]) + current.pos = partition_end - current.pos = partition_end + # Break early if a non-trivial split was found + if partition_end != start and partition_end != end: + break # Reject if min_samples_leaf is not guaranteed if (((current.pos - start) < min_samples_leaf) or From f28074f903402cb1ab08c9038dc2c287a1766c35 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 14 Oct 2015 19:10:29 -0700 Subject: [PATCH 23/35] Fixed a nasty bug where categorical normalization was happening incorrectly with ensembles. Refactored the categorical transformations into functions. 
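The hazard this patch removes appears to be map drift: when each tree in an ensemble built its own value-to-code dictionary from its own view of the data, the same raw category could receive different integer codes in different trees. The refactoring builds the map once in the ensemble's fit and reuses it at predict time. A simplified single-column sketch of the contract (helper names mirror the new preproc_categorical/validate_categorical, but this is not the patched code; the patch builds its map from an unordered set, while this sketch uses np.unique for determinism):

    import numpy as np

    def fit_category_map(col):
        # Fit time: rounded raw values -> codes 0..n_categories-1, built once.
        values = np.unique(np.round(col).astype(np.int64))
        return {v: i for i, v in enumerate(values)}

    def apply_category_map(col, cmap):
        # Predict time: reuse the SAME map everywhere; values never seen in
        # training get fresh codes past n_categories, so they cannot collide
        # with a trained category.
        rounded = np.round(col).astype(np.int64)
        unseen = sorted(set(rounded) - set(cmap))
        new = {v: len(cmap) + i for i, v in enumerate(unseen)}
        return np.array([cmap.get(v, new.get(v)) for v in rounded],
                        dtype=np.float32)

    cmap = fit_category_map(np.array([10., 30., 10., 20.]))  # {10:0, 20:1, 30:2}
    apply_category_map(np.array([30., 99.]), cmap)           # -> [2., 3.]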
--- sklearn/ensemble/forest.py | 13 +- sklearn/ensemble/gradient_boosting.py | 16 ++- sklearn/tree/__init__.py | 1 + sklearn/tree/tree.py | 180 ++++++++++++++++++-------- 4 files changed, 153 insertions(+), 57 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index bc88343261612..e393d7aface71 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -57,7 +57,8 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..metrics import r2_score from ..preprocessing import OneHotEncoder from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, - ExtraTreeClassifier, ExtraTreeRegressor) + ExtraTreeClassifier, ExtraTreeRegressor, + preproc_categorical, validate_categorical) from ..tree._tree import DTYPE, DOUBLE from ..utils import check_random_state, check_array, compute_sample_weight from ..exceptions import DataConversionWarning, NotFittedError @@ -155,6 +156,7 @@ def __init__(self, self.verbose = verbose self.warm_start = warm_start self.class_weight = class_weight + self.category_map_ = None def apply(self, X): """Apply trees in the forest to X, return leaf indices. @@ -260,6 +262,10 @@ def fit(self, X, y, sample_weight=None, categorical='None'): # ensemble sorts the indices. X.sort_indices() + # Preprocess categorical variables + X, _, self.category_map_ = preproc_categorical( + X, categorical, check_input=True) + # Remap output n_samples, self.n_features_ = X.shape @@ -361,7 +367,10 @@ def _validate_X_predict(self, X): raise NotFittedError("Estimator not fitted, " "call `fit` before exploiting the model.") - return self.estimators_[0]._validate_X_predict(X, check_input=True) + X = self.estimators_[0]._validate_X_predict(X, check_input=True) + X = validate_categorical(X, self.category_map_) + + return X @property def feature_importances_(self): diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 087a8430f0b15..93f965e6b13cc 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -47,6 +47,8 @@ from time import time from ..tree.tree import DecisionTreeRegressor +from ..tree.tree import preproc_categorical +from ..tree.tree import validate_categorical from ..tree._tree import DTYPE from ..tree._tree import TREE_LEAF @@ -742,6 +744,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, self.max_leaf_nodes = max_leaf_nodes self.warm_start = warm_start self.presort = presort + self.category_map_ = None self.estimators_ = np.empty((0, 0), dtype=np.object) @@ -981,6 +984,10 @@ def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): y = self._validate_y(y) + # Preprocess categorical variables + X, _, self.category_map_ = preproc_categorical( + X, categorical, check_input=True) + random_state = check_random_state(self.random_state) self._check_params() @@ -1144,9 +1151,10 @@ def decision_function(self, X): Regression and binary classification produce an array of shape [n_samples]. """ - self._check_initialized() X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True) + X = validate_categorical(X, self.category_map_) + score = self._decision_function(X) if score.shape[1] == 1: return score.ravel() @@ -1172,6 +1180,8 @@ def _staged_decision_function(self, X): ``k == 1``, otherwise ``k==n_classes``. 
""" X = check_array(X, dtype=DTYPE, order="C") + X = validate_categorical(X, self.category_map_) + score = self._init_decision_function(X) for i in range(self.estimators_.shape[0]): predict_stage(self.estimators_, i, X, self.learning_rate, score) @@ -1471,6 +1481,8 @@ def decision_function(self, X): [n_samples]. """ X = check_array(X, dtype=DTYPE, order="C") + X = validate_categorical(X, self.category_map_) + score = self._decision_function(X) if score.shape[1] == 1: return score.ravel() @@ -1807,6 +1819,8 @@ def predict(self, X): The predicted values. """ X = check_array(X, dtype=DTYPE, order="C") + X = validate_categorical(X, self.category_map_) + return self._decision_function(X).ravel() def staged_predict(self, X): diff --git a/sklearn/tree/__init__.py b/sklearn/tree/__init__.py index 1394bd914d27c..42bab93f089ab 100644 --- a/sklearn/tree/__init__.py +++ b/sklearn/tree/__init__.py @@ -7,6 +7,7 @@ from .tree import DecisionTreeRegressor from .tree import ExtraTreeClassifier from .tree import ExtraTreeRegressor +from .tree import preproc_categorical, validate_categorical from .export import export_graphviz __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor", diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 02133db98f96b..96e02a1a3d063 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -66,6 +66,117 @@ SPARSE_SPLITTERS = {"best": _splitter.BestSparseSplitter, "random": _splitter.RandomSparseSplitter} + +# ============================================================================= +# Support functions +# ============================================================================= +def preproc_categorical(X, categorical, check_input): + """Preprocess categorical features by mapping them to + range(n_categories). Used for fitting. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + Feature array + categorical : array-like or str + Specification of which features are categorical. See fit(). + check_input : bool + If False, bypass creation of category map and transformation + of X. Use only if you know what you are doing. + + Returns + ------- + X : array, shape=(n_samples, n_features) + Transformed copy of the feature array (or the original if + there are no categorical features) + n_categories : array, shape=(n_features,) + Number of categories for each feature (-1 if non-categorical) + category_map : list, length n_features + For each feature, a dictionary relating values to transformed + values, or an empty dictionary for non-categorical features + + """ + n_features = np.shape(X)[1] + if isinstance(categorical, str): + if categorical == "None": + categorical = np.array([]) + elif categorical == "All": + categorical = np.arange(n_features) + else: + raise ValueError("Invalid value for categorical: %s. 
Allowed" + " strings are 'All' or 'None'" % categorical) + categorical = np.asarray(categorical) + if categorical.dtype == np.bool: + if categorical.shape != (n_features,): + raise ValueError("Shape of boolean parameter categorical must" + " be (n_features,)") + categorical = np.nonzero(categorical)[0] + if (len(categorical.shape) != 1 or + categorical.size > n_features or + (categorical.size > 0 and + (np.min(categorical) < 0 or + np.max(categorical) >= n_features))): + raise ValueError("Invalid shape or invalid feature index for" + " parameter categorical") + if issparse(X) and categorical.size > 0: + raise NotImplementedError("Categorical features not supported with" + " sparse inputs") + + n_categories = np.full(n_features, -1, dtype=np.int32) + category_map = [{}] * n_features + if categorical.size > 0 and check_input: + X = np.copy(X) + for feature in categorical: + rounded = np.round(X[:, feature]).astype(np.int64) + unique_rounded = np.unique(rounded) + if check_input: + category_map[feature] = dict(zip(unique_rounded, count())) + X[:, feature] = np.array([category_map[feature][x] + for x in rounded], dtype=DTYPE) + n_categories[feature] = len(unique_rounded) + + return X, n_categories, category_map + + +def validate_categorical(X, category_map): + """Map categorical features onto sequential integers. Used for + predicting. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + Feature array + category_map : list, length n_features + For each feature, a dictionary relating values to transformed + values, or an empty dictionary for non-categorical features + + Returns + ------- + X : array, shape=(n_samples, n_features) + Transformed copy of the feature array (or the original if + there are no categorical features) + """ + if category_map is None: + return X + + n_categories = np.array([len(x) for x in category_map]) + categorical_features = np.nonzero(n_categories > 0)[0] + if categorical_features.size > 0: + if issparse(X): + raise NotImplementedError("Categorical features not supported" + " with sparse inputs") + X = np.copy(X) + for feature in categorical_features: + rounded = np.round(X[:, feature]).astype('int64') + new_cat = set(rounded) - set(category_map[feature]) + new_cat_map = dict(zip(new_cat, count(n_categories[feature]))) + X[:, feature] = np.array( + [category_map[feature].get(x, new_cat_map.get(x)) + for x in rounded]).astype(DTYPE) + + return X + + # ============================================================================= # Base decision tree # ============================================================================= @@ -312,41 +423,9 @@ def fit(self, X, y, sample_weight=None, categorical='None', else: sample_weight = expanded_class_weight - if isinstance(categorical, str): - if categorical == "None": - categorical = np.array([]) - elif categorical == "All": - categorical = np.arange(self.n_features_) - else: - raise ValueError("Invalid value for categorical: %s. 
Allowed" - " strings are 'All' or 'None'" % categorical) - categorical = np.atleast_1d(categorical).flatten() - if categorical.dtype == np.bool: - if categorical.size != self.n_features_: - raise ValueError("Shape of boolean parameter categorical must" - " be [n_features]") - categorical = np.nonzero(categorical)[0] - if (categorical.size > self.n_features_ or - (categorical.size > 0 and - (np.min(categorical) < 0 or - np.max(categorical) >= self.n_features_))): - raise ValueError("Invalid shape or invalid feature index for" - " parameter categorical") - if issparse(X) and len(categorical) > 0: - raise NotImplementedError("Categorical features not supported with" - " sparse inputs") - - # Determine the number of categories in each categorical feature - n_categories = np.zeros(self.n_features_, dtype=np.int32) - 1 - self.category_map_ = [None] * self.n_features_ - if categorical.size > 0: - X = np.copy(X) - for feature in categorical: - rounded = np.round(X[:, feature]).astype(np.int64) - self.category_map_[feature] = dict(zip(set(rounded), count())) - X[:, feature] = np.array([self.category_map_[feature][x] - for x in rounded]).astype(DTYPE) - n_categories[feature] = len(self.category_map_[feature]) + # Do preprocessing of categorical variables + X, n_categories, self.category_map_ = preproc_categorical( + X, categorical, check_input) # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: @@ -453,22 +532,6 @@ def _validate_X_predict(self, X, check_input): "input n_features is %s " % (self.n_features_, n_features)) - # Map categorical features onto integers - n_categories = self.tree_.n_categories - categorical_features = np.nonzero(n_categories > 0)[0] - if categorical_features.size > 0: - if issparse(X): - raise NotImplementedError("Categorical features not supported" - " with sparse inputs") - X = np.copy(X) - for feature in categorical_features: - rounded = np.round(X[:, feature]).astype('int64') - new_cat = set(rounded) - set(self.category_map_[feature]) - new_cat_map = dict(zip(new_cat, count(n_categories[feature]))) - X[:, feature] = np.array( - [self.category_map_[feature].get(x, new_cat_map.get(x)) - for x in rounded]).astype(DTYPE) - return X def predict(self, X, check_input=True): @@ -487,7 +550,7 @@ def predict(self, X, check_input=True): check_input : boolean, (default=True) Allow to bypass several input checking. - Don't use this parameter unless you know what you do. + Don't use this parameter unless you know what you are doing. Returns ------- @@ -496,6 +559,9 @@ def predict(self, X, check_input=True): """ X = self._validate_X_predict(X, check_input) + if check_input: + X = validate_categorical(X, self.category_map_) + proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -537,7 +603,7 @@ def apply(self, X, check_input=True): check_input : boolean, (default=True) Allow to bypass several input checking. - Don't use this parameter unless you know what you do. + Don't use this parameter unless you know what you are doing. Returns ------- @@ -548,6 +614,9 @@ def apply(self, X, check_input=True): numbering. """ X = self._validate_X_predict(X, check_input) + if check_input: + X = validate_categorical(X, self.category_map_) + return self.tree_.apply(X) def decision_path(self, X, check_input=True): @@ -782,7 +851,7 @@ class in a leaf. check_input : boolean, (default=True) Allow to bypass several input checking. - Don't use this parameter unless you know what you do. 
+ Don't use this parameter unless you know what you are doing. Parameters ---------- @@ -799,6 +868,9 @@ class in a leaf. classes corresponds to that in the attribute `classes_`. """ X = self._validate_X_predict(X, check_input) + if check_input: + X = validate_categorical(X, self.category_map_) + proba = self.tree_.predict(X) if self.n_outputs_ == 1: From 36c2e47faf96ec50a067c842f655f3816f46b5db Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 9 Dec 2015 19:26:49 -0800 Subject: [PATCH 24/35] Added some unit tests. More to come. --- sklearn/tree/tests/test_tree.py | 41 +++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index e4ca2be5e452a..1b55529bbca7f 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1443,3 +1443,44 @@ def test_no_sparse_y_support(): # Currently we don't support sparse y for name in ALL_TREES: yield (check_no_sparse_y_support, name) + + +def test_invalid_categorical_str(): + check = lambda nm: assert_raises(ValueError, ALL_TREES[nm]().fit, X, y, + categorical='example invalid string') + for name in ALL_TREES: + yield check, name + + +def test_invalid_categorical_bool(): + check = lambda nm: assert_raises(ValueError, ALL_TREES[nm]().fit, X, y, + categorical=[False, False, False]) + for name in ALL_TREES: + yield check, name + + +def check_invalid_categorical_idx(name): + Tree = ALL_TREES[name] + bad_catvals = [[1, 2], [-3], [[0]], [0, 0, 1]] + for catval in bad_catvals: + assert_raises(ValueError, Tree().fit, X, y, categorical=catval) + + +def test_invalid_categorical_idx(): + for name in ALL_TREES: + yield check_invalid_categorical_idx, name + + +def check_no_sparse_with_categorical(name): + X, y, X_sparse = [DATASETS['toy'][z] for z in + ['X', 'y', 'X_sparse']] + Tree = ALL_TREES[name] + assert_raises(NotImplementedError, Tree().fit, X_sparse, y, + categorical='All') + assert_raises(NotImplementedError, + Tree().fit(X, y, categorical='All').predict, X_sparse) + + +def test_no_sparse_with_categorical(): + for name in SPARSE_TREES: + yield check_no_sparse_with_categorical, name From d555231c0043984b0c4c95fed87bb94653112133 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 21 Mar 2016 18:49:02 -0700 Subject: [PATCH 25/35] Upped the maximum number of times RandomSplitter will retry if it accidentally creates a trivial split. --- sklearn/tree/_splitter.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index e259f524bc704..028f346c814d8 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -803,9 +803,9 @@ cdef class RandomSplitter(BaseDenseSplitter): features[f_i], features[f_j] = features[f_j], features[f_i] # Construct a random split - # Repeat up to 20 times if a trivial split is constructed + # Repeat up to 60 times if a trivial split is constructed # (this can only happen with a categorical feature) - for q in range(20): + for q in range(60): is_categorical = self.n_categories[current.feature] > 0 if is_categorical: split_seed = our_rand_r(random_state) From e2fb46ed6022e8e82348e75ba7809d39cf94407d Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sat, 26 Mar 2016 00:08:46 -0700 Subject: [PATCH 26/35] Added code to BestSplitter to restrict split trials to categories represented in the local sample. 
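The patch below cuts the search space from 2**n_categories to 2**n_present by enumerating subsets of only the categories that actually occur in the node, then shifting each bit back to its absolute position via the cat_offs table. A pure-Python sketch of that bookkeeping (same arithmetic as the hunk, minus the Cython types):

    def present_categories(values, n_categories):
        seen = 0
        for v in values:
            seen |= 1 << int(v)
        cat_offs = []                   # cat_offs[local] = absolute - local
        n_present = 0
        for q in range(n_categories):
            if (seen >> q) & 1:
                cat_offs.append(q - n_present)
                n_present += 1
        return cat_offs

    def expand_subset(p, cat_offs):
        # p indexes a subset of the *present* categories; shift each set bit
        # left by its offset to land on the absolute category index.
        cat_split = 0
        for q, off in enumerate(cat_offs):
            cat_split |= (p & (1 << q)) << off
        return cat_split

    offs = present_categories([0, 2, 5], n_categories=8)  # only 3 of 8 present
    assert expand_subset(0b110, offs) == 0b100100         # local {1,2} -> {2,5}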
--- sklearn/tree/_splitter.pyx | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 028f346c814d8..f002a5bdb5e06 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -365,6 +365,9 @@ cdef class BestSplitter(BaseDenseSplitter): cdef DTYPE_t current_feature_value cdef SIZE_t partition_end cdef bint is_categorical + cdef UINT64_t cat_split + cdef INT32_t ncat_present + cdef INT32_t cat_offs[64] _init_split(&best, end) @@ -452,23 +455,40 @@ cdef class BestSplitter(BaseDenseSplitter): # Evaluate all splits self.criterion.reset() is_categorical = self.n_categories[current.feature] > 0 - p = 0 if is_categorical else start + if is_categorical: + p = 0 + # Identify the subset of categories present (for performance reasons) + cat_split = 0 + ncat_present = 0 + for q in range(start, end): + cat_split |= 1 << (Xf[q]) + for q in range(self.n_categories[current.feature]): + if cat_split & (1 << q): + cat_offs[ncat_present] = q - ncat_present + ncat_present += 1 + else: + p = start while True: if is_categorical: # WARNING: This is O(n_samples * # 2**n_categories), and will be very slow # for more than just a few categories. - if p > (1 << self.n_categories[current.feature]) - 1: + if p > (1 << ncat_present) - 1: break else: p += 2 # LSB must always be 0 + # Expand the bits of p out into cat_split + cat_split = 0 + for q in range(ncat_present): + cat_split |= (p & (1 << q)) << cat_offs[q] + # Partition q = start partition_end = end while q < partition_end: - if (p >> Xf[q]) & 1: + if (cat_split >> Xf[q]) & 1: q += 1 else: partition_end -= 1 @@ -510,7 +530,7 @@ cdef class BestSplitter(BaseDenseSplitter): if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement if is_categorical: - current.split_value.cat_split = p + current.split_value.cat_split = cat_split else: current.split_value.threshold = (Xf[p - 1] + Xf[p]) / 2.0 if current.split_value.threshold == Xf[p]: From 05cd4fefcf0275056433371d27d3c6a2efd06fea Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Thu, 5 May 2016 21:57:18 -0700 Subject: [PATCH 27/35] Fixed a warning, which was actually a compile error in clang. --- sklearn/tree/_utils.pxd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index aea80233cb7c3..5d2bf007c3926 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -59,11 +59,11 @@ cdef double log(double x) nogil # Functions for traversing a tree -cdef inline void make_bit_cache(SplitValue split, INT32_t n_categories, - UINT8_t* bit_cache) nogil +cdef void make_bit_cache(SplitValue split, INT32_t n_categories, + UINT8_t* bit_cache) nogil -cdef inline bint goes_left(DTYPE_t feature_value, SplitValue split, - INT32_t n_categories, UINT8_t* bit_cache) nogil +cdef bint goes_left(DTYPE_t feature_value, SplitValue split, + INT32_t n_categories, UINT8_t* bit_cache) nogil # ============================================================================= # Stack data structure From 227d65ee3809eff505fe087a3349ec4d4a6424fd Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 29 May 2016 22:22:03 -0700 Subject: [PATCH 28/35] Fixed a bug where BestSplitter was miscalculating impurities for categorical features. 
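The one-line fix below (self.criterion.reset()) guards an invariant that is easy to miss: the criterion accumulates its left/right statistics incrementally by position in samples[], so physically reordering that array, as the categorical partition loop does, silently invalidates the running tallies. A toy stand-in (not the real Criterion API) showing the failure mode:

    class RunningSums:
        """Tracks sum(y[:pos]) incrementally, like a miniature Criterion."""
        def __init__(self, y):
            self.y = list(y)
            self.reset()
        def reset(self):
            self.pos = 0
            self.left_sum = 0.0
        def update(self, new_pos):
            # Only walks y[pos:new_pos]; correct only while y's ordering is
            # unchanged since the last reset.
            for i in range(self.pos, new_pos):
                self.left_sum += self.y[i]
            self.pos = new_pos

    crit = RunningSums([1, 0, 1, 1])
    crit.update(2)                               # left_sum == 1
    crit.y[1], crit.y[2] = crit.y[2], crit.y[1]  # partition reorders samples
    crit.reset()                                 # ...so tallies must restart
    crit.update(2)
    assert crit.left_sum == 2                    # now matches the new order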
--- sklearn/tree/_splitter.pyx | 3 +++ sklearn/tree/log | 0 2 files changed, 3 insertions(+) create mode 100644 sklearn/tree/log diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index f002a5bdb5e06..bfc1e0c7a787d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -496,6 +496,9 @@ cdef class BestSplitter(BaseDenseSplitter): samples[q], samples[partition_end] = ( samples[partition_end], samples[q]) current.pos = q + + # Must reset criterion since we've reordered the samples + self.criterion.reset() else: # Non-categorical feature while (p + 1 < end and diff --git a/sklearn/tree/log b/sklearn/tree/log new file mode 100644 index 0000000000000..e69de29bb2d1d From 548c94506a9fc0984e328d92489845266d5a10a4 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Thu, 26 May 2016 23:21:51 -0700 Subject: [PATCH 29/35] Added an implementation of the Breiman shortcut. Turned off for now. --- sklearn/tree/_splitter.pyx | 76 +++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index bfc1e0c7a787d..ed3136c204cc3 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -317,8 +317,51 @@ cdef class BestSplitter(BaseDenseSplitter): self.random_state, self.presort), self.__getstate__()) + + cdef void _shortcut_catlist(self, SIZE_t start, SIZE_t end, INT32_t ncat, + INT32_t ncat_present, const INT32_t *cat_offs, + SIZE_t *shortcut_cat) nogil: + """The Breiman shortcut for finding the best split involves a + preprocessing step wherein we sort the categories by + increasing (weighted) mean of the outcome y (whether 0/1 + binary for classification or quantitative for + regression). This function implements this preprocessing step + and produces a sorted list of category values. 
+ + """ + cdef SIZE_t *samples = self.samples + cdef DTYPE_t *Xf = self.feature_values + cdef DOUBLE_t *y = self.y + cdef SIZE_t y_stride = self.y_stride + cdef DOUBLE_t *sample_weight = self.sample_weight + cdef DOUBLE_t w + cdef SIZE_t cat, localcat + cdef SIZE_t q, partition_end + cdef DTYPE_t sort_value[64] + cdef DTYPE_t sort_den[64] + + for cat in range(ncat): + sort_value[cat] = 0 + sort_den[cat] = 0 + + for q in range(start, end): + cat = Xf[q] + w = sample_weight[samples[q]] if sample_weight else 1.0 + sort_value[cat] += w * (y[y_stride * samples[q]]) + sort_den[cat] += w + + for localcat in range(ncat_present): + cat = localcat + cat_offs[localcat] + sort_value[localcat] = sort_value[cat] / sort_den[cat] + shortcut_cat[localcat] = cat + + # Second step: sort by decreasing impurity + sort(&sort_value[0], shortcut_cat, ncat_present) + + cdef void node_split(self, double impurity, SplitRecord* split, SIZE_t* n_constant_features) nogil: + """Find the best split on node samples[start:end].""" # Find the best split cdef SIZE_t* samples = self.samples @@ -368,6 +411,8 @@ cdef class BestSplitter(BaseDenseSplitter): cdef UINT64_t cat_split cdef INT32_t ncat_present cdef INT32_t cat_offs[64] + cdef SIZE_t shortcut_cat[64] + cdef bint shortcut = 0 _init_split(&best, end) @@ -466,23 +511,34 @@ cdef class BestSplitter(BaseDenseSplitter): if cat_split & (1 << q): cat_offs[ncat_present] = q - ncat_present ncat_present += 1 + if shortcut: + self._shortcut_catlist(start, end, self.n_categories[current.feature], + ncat_present, cat_offs, &shortcut_cat[0]) else: p = start while True: if is_categorical: - # WARNING: This is O(n_samples * - # 2**n_categories), and will be very slow - # for more than just a few categories. - if p > (1 << ncat_present) - 1: - break + if shortcut: + p += 1 + if p >= ncat_present: + break + cat_split = 0 + for q in range(p): + cat_split |= ( 1) << shortcut_cat[q] + if cat_split & 1: + cat_split = (~cat_split) & ( + (( 1) << self.n_categories[current.feature]) - 1) else: - p += 2 # LSB must always be 0 + if p > (1 << ncat_present) - 1: + break + else: + p += 2 # LSB must always be 0 - # Expand the bits of p out into cat_split - cat_split = 0 - for q in range(ncat_present): - cat_split |= (p & (1 << q)) << cat_offs[q] + # Expand the bits of p out into cat_split + cat_split = 0 + for q in range(ncat_present): + cat_split |= (p & (( 1) << q)) << cat_offs[q] # Partition q = start From 1f901ea38dc0efa0aa33a2ec697fac2488d47883 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 30 May 2016 23:34:18 -0700 Subject: [PATCH 30/35] Added code to automatically trigger the Breiman shortcut when appropriate for categorical features. 
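For reference, the shortcut being auto-enabled below is Breiman's classic result: sort the categories by the (weighted) mean of y, and the best binary partition is a prefix of that ordering, so only k-1 candidate splits need scoring instead of 2**(k-1)-1 subsets. The result is exact for two-class classification with Gini or entropy and for MSE regression, which is what the tree.py hunk gates on. A small illustrative sketch, not the Cython implementation:

    import numpy as np

    def breiman_candidate_splits(x_cat, y):
        cats = np.unique(x_cat)
        means = np.array([y[x_cat == c].mean() for c in cats])
        order = cats[np.argsort(means)]
        # Each candidate sends a prefix of the mean-sorted categories left.
        return [set(order[:i].tolist()) for i in range(1, len(order))]

    x = np.array([0, 0, 1, 1, 2, 2])
    y = np.array([0., 1., 1., 1., 0., 0.])
    breiman_candidate_splits(x, y)   # [{2}, {0, 2}]: 2 candidates, not 3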
--- sklearn/tree/_splitter.pxd | 2 ++ sklearn/tree/_splitter.pyx | 10 ++++++---- sklearn/tree/tree.py | 9 ++++++++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 0565a87dde706..df009c988f925 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -82,6 +82,8 @@ cdef class Splitter: cdef bint presort # Whether to use presorting, only # allowed on dense data + cdef bint shortcut # Whether decision trees are allowed to use the + # Breiman shortcut for categorical features cdef DOUBLE_t* y cdef SIZE_t y_stride diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ed3136c204cc3..d155a6769fbdf 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -63,7 +63,7 @@ cdef class Splitter: def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state, bint presort): + object random_state, bint presort, bint shortcut): """ Parameters ---------- @@ -106,6 +106,7 @@ cdef class Splitter: self.min_weight_leaf = min_weight_leaf self.random_state = random_state self.presort = presort + self.shortcut = shortcut def __dealloc__(self): """Destructor.""" @@ -263,7 +264,7 @@ cdef class BaseDenseSplitter(Splitter): def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state, bint presort): + object random_state, bint presort, bint shortcut): self.X = NULL self.X_sample_stride = 0 @@ -412,7 +413,7 @@ cdef class BestSplitter(BaseDenseSplitter): cdef INT32_t ncat_present cdef INT32_t cat_offs[64] cdef SIZE_t shortcut_cat[64] - cdef bint shortcut = 0 + cdef bint shortcut = self.shortcut _init_split(&best, end) @@ -511,6 +512,7 @@ cdef class BestSplitter(BaseDenseSplitter): if cat_split & (1 << q): cat_offs[ncat_present] = q - ncat_present ncat_present += 1 + shortcut = self.shortcut if ncat_present > 3 else 0 # No benefit for small N if shortcut: self._shortcut_catlist(start, end, self.n_categories[current.feature], ncat_present, cat_offs, &shortcut_cat[0]) @@ -996,7 +998,7 @@ cdef class BaseSparseSplitter(Splitter): def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state, bint presort): + object random_state, bint presort, bint shortcut): # Parent __cinit__ is automatically called self.X_data = NULL diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 96e02a1a3d063..20b5b9857be06 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -469,6 +469,12 @@ def fit(self, X, y, sample_weight=None, categorical='None', self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_) + if is_classification: + use_shortcut = (self.n_classes_.tolist() == [2] and + (isinstance(criterion, _criterion.Gini) or + isinstance(criterion, _criterion.Entropy))) + else: + use_shortcut = isinstance(criterion, _criterion.MSE) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS @@ -479,7 +485,8 @@ def fit(self, X, y, sample_weight=None, categorical='None', min_samples_leaf, min_weight_leaf, random_state, - self.presort) + self.presort, + use_shortcut) if (not isinstance(splitter, _splitter.RandomSplitter) and np.max(n_categories) > 64): From 18cecc5040b592d12be4613d57d1260c7e586ac1 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 5 Jun 2016 13:49:23 -0700 Subject: [PATCH 31/35] Fixed a left-shift-too-far (undefined behavior) bug. 
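The undefined behavior in question: in C, shifting a 64-bit integer by 64 or more positions is undefined, so ((UINT64)1 << n) - 1 breaks exactly at the boundary case n == 64 that a full 64-category bitfield needs. The patch rewrites these as down-shifts of an all-ones word, keeping every shift count within 0..63. Python integers are unbounded, so the sketch below emulates uint64 with a mask just to show the two forms agree where both are defined:

    MASK64 = (1 << 64) - 1

    def ones_by_upshift(n):        # C analogue of ((UINT64)1 << n) - 1
        return ((1 << n) - 1) & MASK64    # in C this is UB once n == 64

    def ones_by_downshift(n):      # C analogue of (~(UINT64)0) >> (64 - n)
        return (~0 & MASK64) >> (64 - n)  # shift count is 0..63 for n >= 1

    assert all(ones_by_upshift(n) == ones_by_downshift(n) for n in range(1, 64))
    assert ones_by_downshift(64) == MASK64   # well-defined even at n == 64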
--- sklearn/tree/_splitter.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index d155a6769fbdf..aebe988e15e6d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -527,12 +527,12 @@ cdef class BestSplitter(BaseDenseSplitter): break cat_split = 0 for q in range(p): - cat_split |= ( 1) << shortcut_cat[q] + cat_split |= ( 1) << shortcut_cat[q] if cat_split & 1: cat_split = (~cat_split) & ( - (( 1) << self.n_categories[current.feature]) - 1) + (~( 0)) >> (64 - self.n_categories[current.feature])) else: - if p > (1 << ncat_present) - 1: + if p > ((~( 0)) >> (64 - ncat_present)): break else: p += 2 # LSB must always be 0 @@ -540,7 +540,7 @@ cdef class BestSplitter(BaseDenseSplitter): # Expand the bits of p out into cat_split cat_split = 0 for q in range(ncat_present): - cat_split |= (p & (( 1) << q)) << cat_offs[q] + cat_split |= (p & (( 1) << q)) << cat_offs[q] # Partition q = start From 138c8d4b71e6dee78581459b1433c4f6e5d3b939 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 4 Jul 2016 22:48:10 -0700 Subject: [PATCH 32/35] Removed numpy access to the split value union to pacify unit tests run with numpy v1.6. This should be reverted when support for numpy v1.6 is dropped. --- sklearn/tree/_tree.pyx | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 33756f0932c5b..401de169975a8 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -68,16 +68,11 @@ cdef SIZE_t INITIAL_STACK_SIZE = 10 cdef DTYPE_t MIN_IMPURITY_SPLIT = 1e-7 # Repeat struct definition for numpy -SPLITVALUE_DTYPE = np.dtype({ - 'names': ['threshold', 'cat_split'], - 'formats': [np.float64, np.uint64], - 'offsets': [0, 0] -}) NODE_DTYPE = np.dtype({ - 'names': ['left_child', 'right_child', 'feature', 'split_value', + 'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', '_bit_cache'], - 'formats': [np.intp, np.intp, np.intp, SPLITVALUE_DTYPE, np.float64, + 'formats': [np.intp, np.intp, np.intp, np.float64, np.float64, np.intp, np.float64, np.intp], 'offsets': [ &( NULL).left_child, @@ -589,11 +584,7 @@ cdef class Tree: property threshold: def __get__(self): - return self._get_node_ndarray()['split_value']['threshold'][:self.node_count] - - property split_value: - def __get__(self): - return self._get_node_ndarray()['split_value'][:self.node_count] + return self._get_node_ndarray()['threshold'][:self.node_count] property impurity: def __get__(self): From 1330942d70b0161fe4ae4e86b1beab8171aaaf19 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 13 Jul 2016 22:19:43 -0700 Subject: [PATCH 33/35] Replaced a call to np.full with np.ones, to accomodate older versions of numpy. 
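The portability nit here: np.full was only added in numpy 1.8, while the negated np.ones spelling works on the older versions this branch still supported. The two are equivalent:

    import numpy as np

    n_features = 5
    a = -np.ones(n_features, dtype=np.int32)      # works on old numpy
    b = np.full(n_features, -1, dtype=np.int32)   # requires numpy >= 1.8
    assert (a == b).all()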
--- sklearn/tree/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 20b5b9857be06..c10303d395ff6 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -122,7 +122,7 @@ def preproc_categorical(X, categorical, check_input): raise NotImplementedError("Categorical features not supported with" " sparse inputs") - n_categories = np.full(n_features, -1, dtype=np.int32) + n_categories = -np.ones(n_features, dtype=np.int32) category_map = [{}] * n_features if categorical.size > 0 and check_input: X = np.copy(X) From 2a555b17c772b687ab5344092a9f5bb90ac0d308 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Thu, 14 Jul 2016 08:57:43 -0700 Subject: [PATCH 34/35] Moved the categorical parameter from fit() to the constructor, for trees, forests, and gradient boosting. Tweaked unit tests to match. --- sklearn/ensemble/forest.py | 116 +++++++++++++++++++------- sklearn/ensemble/gradient_boosting.py | 64 +++++++------- sklearn/tree/tests/test_tree.py | 23 +++-- sklearn/tree/tree.py | 92 +++++++++++++------- 4 files changed, 202 insertions(+), 93 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index e393d7aface71..8afabe098bb27 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -91,8 +91,8 @@ def _generate_unsampled_indices(random_state, n_samples): return unsampled_indices -def _parallel_build_trees(tree, forest, X, y, sample_weight, categorical, - tree_idx, n_trees, verbose=0, class_weight=None): +def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, + n_trees, verbose=0, class_weight=None): """Private function used to fit a single tree in parallel.""" if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) @@ -115,11 +115,9 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, categorical, elif class_weight == 'balanced_subsample': curr_sample_weight *= compute_sample_weight('balanced', y, indices) - tree.fit(X, y, sample_weight=curr_sample_weight, - categorical=categorical, check_input=False) + tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) else: - tree.fit(X, y, sample_weight=sample_weight, - categorical=categorical, check_input=False) + tree.fit(X, y, sample_weight=sample_weight, check_input=False) return tree @@ -215,7 +213,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr - def fit(self, X, y, sample_weight=None, categorical='None'): + def fit(self, X, y, sample_weight=None): """Build a forest of trees from the training set (X, y). Parameters @@ -236,19 +234,6 @@ def fit(self, X, y, sample_weight=None, categorical='None'): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - categorical : array-like or str - Array of feature indices, boolean array of length - n_features, ``'All'``, or ``'None'``. Indicates which - features should be considered as categorical rather than - ordinal. For decision trees, the maximum number of - categories per feature is 64, though the real-world limit - will be much lower because evaluating splits has - :math:`O(2^N)` time complexity, for :math:`N` - categories. Extra-randomized trees do not have this - limitation because they do not try to find the best - split. For these trees, the maximum number of categories - per feature is :math:`2^{31}`. 
- Returns ------- self : object @@ -264,7 +249,7 @@ def fit(self, X, y, sample_weight=None, categorical='None'): # Preprocess categorical variables X, _, self.category_map_ = preproc_categorical( - X, categorical, check_input=True) + X, self.categorical, check_input=True) # Remap output n_samples, self.n_features_ = X.shape @@ -336,7 +321,7 @@ def fit(self, X, y, sample_weight=None, categorical='None'): trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")( delayed(_parallel_build_trees)( - t, self, X, y, sample_weight, categorical, i, len(trees), + t, self, X, y, sample_weight, i, len(trees), verbose=self.verbose, class_weight=self.class_weight) for i, t in enumerate(trees)) @@ -829,6 +814,19 @@ class RandomForestClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + bootstrap : boolean, optional (default=True) Whether bootstrap samples are used when building trees. @@ -922,6 +920,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + categorical="none", bootstrap=True, oob_score=False, n_jobs=1, @@ -935,7 +934,7 @@ def __init__(self, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", - "random_state"), + "random_state", "categorical"), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -951,6 +950,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.categorical = categorical class RandomForestRegressor(ForestRegressor): @@ -1022,6 +1022,19 @@ class RandomForestRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + bootstrap : boolean, optional (default=True) Whether bootstrap samples are used when building trees. 
@@ -1085,6 +1098,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + categorical="none", bootstrap=True, oob_score=False, n_jobs=1, @@ -1097,7 +1111,7 @@ def __init__(self, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", - "random_state"), + "random_state", "categorical"), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -1112,6 +1126,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.categorical = categorical class ExtraTreesClassifier(ForestClassifier): @@ -1181,6 +1196,19 @@ class ExtraTreesClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. @@ -1276,6 +1304,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + categorical="none", bootstrap=False, oob_score=False, n_jobs=1, @@ -1289,7 +1318,7 @@ def __init__(self, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", - "random_state"), + "random_state", "categorical"), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -1305,6 +1334,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.categorical = categorical class ExtraTreesRegressor(ForestRegressor): @@ -1374,6 +1404,19 @@ class ExtraTreesRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. 
@@ -1438,6 +1481,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + categorical="none", bootstrap=False, oob_score=False, n_jobs=1, @@ -1450,7 +1494,7 @@ def __init__(self, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", - "random_state"), + "random_state", "categorical"), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -1465,6 +1509,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.categorical = categorical class RandomTreesEmbedding(BaseForest): @@ -1541,6 +1586,19 @@ class RandomTreesEmbedding(BaseForest): and add more estimators to the ensemble, otherwise, just fit a whole new forest. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + Attributes ---------- estimators_ : list of DecisionTreeClassifier @@ -1567,14 +1625,15 @@ def __init__(self, n_jobs=1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + categorical="none"): super(RandomTreesEmbedding, self).__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", - "random_state"), + "random_state", "categorical"), bootstrap=False, oob_score=False, n_jobs=n_jobs, @@ -1590,6 +1649,7 @@ def __init__(self, self.max_features = 1 self.max_leaf_nodes = max_leaf_nodes self.sparse_output = sparse_output + self.categorical = categorical def _set_oob_score(self, X, y): raise NotImplementedError("OOB score not supported by tree embedding") diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 93f965e6b13cc..66af4b3741f39 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -726,7 +726,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, init, subsample, max_features, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, - warm_start=False, presort='auto'): + warm_start=False, presort='auto', categorical='none'): self.n_estimators = n_estimators self.learning_rate = learning_rate @@ -744,13 +744,13 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, self.max_leaf_nodes = max_leaf_nodes self.warm_start = warm_start self.presort = presort + self.categorical = categorical self.category_map_ = None self.estimators_ = np.empty((0, 0), dtype=np.object) def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, - categorical, random_state, X_idx_sorted, X_csc=None, - X_csr=None): + random_state, X_idx_sorted, X_csc=None, X_csr=None): """Fit another stage of ``n_classes_`` trees to the boosting model. 
""" assert sample_mask.dtype == np.bool @@ -775,15 +775,16 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, random_state=random_state, - presort=self.presort) + presort=self.presort, + categorical=self.categorical) if self.subsample < 1.0: # no inplace multiplication! sample_weight = sample_weight * sample_mask.astype(np.float64) tree.fit(X_csc if X_csc is not None else X, residual, - sample_weight=sample_weight, categorical=categorical, - check_input=False, X_idx_sorted=X_idx_sorted) + sample_weight=sample_weight, check_input=False, + X_idx_sorted=X_idx_sorted) # update tree leaves loss.update_terminal_regions( @@ -924,7 +925,7 @@ def _check_initialized(self): raise NotFittedError("Estimator not fitted, call `fit`" " before making predictions`.") - def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): + def fit(self, X, y, sample_weight=None, monitor=None): """Fit the gradient boosting model. Parameters @@ -945,15 +946,6 @@ def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - categorical : array-like or str - Array of feature indices, boolean array of length - n_features, ``'All'``, or ``'None'``. Indicates which - features should be considered as categorical rather than - ordinal. The maximum number of categories per feature is - 64, though the real-world limit will be much lower because - evaluating splits has :math:`O(2^N)` time complexity, for - :math:`N` categories. - monitor : callable, optional The monitor is called after each iteration with the current iteration, a reference to the estimator and the local variables of @@ -986,7 +978,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): # Preprocess categorical variables X, _, self.category_map_ = preproc_categorical( - X, categorical, check_input=True) + X, self.categorical, check_input=True) random_state = check_random_state(self.random_state) self._check_params() @@ -1031,9 +1023,8 @@ def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): dtype=np.int32) # fit the boosting stages - n_stages = self._fit_stages(X, y, y_pred, sample_weight, categorical, - random_state, begin_at_stage, monitor, - X_idx_sorted) + n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state, + begin_at_stage, monitor, X_idx_sorted) # change shape of arrays after fit (early-stopping or additional ests) if n_stages != self.estimators_.shape[0]: self.estimators_ = self.estimators_[:n_stages] @@ -1043,9 +1034,8 @@ def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): return self - def _fit_stages(self, X, y, y_pred, sample_weight, categorical, - random_state, begin_at_stage=0, monitor=None, - X_idx_sorted=None): + def _fit_stages(self, X, y, y_pred, sample_weight, random_state, + begin_at_stage=0, monitor=None, X_idx_sorted=None): """Iteratively fits the stages. For each stage it computes the progress (OOB, train score) @@ -1088,7 +1078,7 @@ def _fit_stages(self, X, y, y_pred, sample_weight, categorical, # fit next stage of trees y_pred = self._fit_stage(i, X, y, y_pred, sample_weight, - sample_mask, categorical, random_state, + sample_mask, random_state, X_idx_sorted, X_csc, X_csr) # track deviance (= loss) @@ -1393,6 +1383,15 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): .. 
versionadded:: 0.17 *presort* parameter. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. The maximum number of categories per feature is + 64, though the real-world limit will be much lower because + evaluating splits has :math:`O(2^N)` time complexity, for + :math:`N` categories. + Attributes ---------- feature_importances_ : array, shape = [n_features] @@ -1445,7 +1444,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, max_depth=3, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, - presort='auto'): + presort='auto', categorical='none'): super(GradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, @@ -1456,7 +1455,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, max_features=max_features, random_state=random_state, verbose=verbose, max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, - presort=presort) + presort=presort, categorical=categorical) def _validate_y(self, y): check_classification_targets(y) @@ -1744,6 +1743,15 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): .. versionadded:: 0.17 optional parameter *presort*. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. The maximum number of categories per feature is + 64, though the real-world limit will be much lower because + evaluating splits has :math:`O(2^N)` time complexity, for + :math:`N` categories. + Attributes ---------- feature_importances_ : array, shape = [n_features] @@ -1792,7 +1800,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, - warm_start=False, presort='auto'): + warm_start=False, presort='auto', categorical='none'): super(GradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, @@ -1803,7 +1811,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, max_features=max_features, random_state=random_state, alpha=alpha, verbose=verbose, max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, - presort=presort) + presort=presort, categorical=categorical) def predict(self, X): """Predict regression target for X. 
diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1b55529bbca7f..c03420def7723 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1446,24 +1446,31 @@ def test_no_sparse_y_support(): def test_invalid_categorical_str(): - check = lambda nm: assert_raises(ValueError, ALL_TREES[nm]().fit, X, y, + check = lambda nm: assert_raises(ValueError, ALL_TREES[nm], categorical='example invalid string') for name in ALL_TREES: yield check, name def test_invalid_categorical_bool(): - check = lambda nm: assert_raises(ValueError, ALL_TREES[nm]().fit, X, y, - categorical=[False, False, False]) + check = lambda nm: assert_raises( + ValueError, ALL_TREES[nm](categorical=[False, False, False]).fit, X, y) + for name in ALL_TREES: + yield check, name + + +def test_invalid_categorical_shape(): + check = lambda nm: assert_raises( + ValueError, ALL_TREES[nm], categorical=[[0]]) for name in ALL_TREES: yield check, name def check_invalid_categorical_idx(name): Tree = ALL_TREES[name] - bad_catvals = [[1, 2], [-3], [[0]], [0, 0, 1]] + bad_catvals = ([1, 2], [-3], [0, 0, 1]) for catval in bad_catvals: - assert_raises(ValueError, Tree().fit, X, y, categorical=catval) + assert_raises(ValueError, Tree(categorical=catval).fit, X, y) def test_invalid_categorical_idx(): @@ -1475,10 +1482,10 @@ def check_no_sparse_with_categorical(name): X, y, X_sparse = [DATASETS['toy'][z] for z in ['X', 'y', 'X_sparse']] Tree = ALL_TREES[name] - assert_raises(NotImplementedError, Tree().fit, X_sparse, y, - categorical='All') + assert_raises(NotImplementedError, Tree(categorical='all').fit, + X_sparse, y) assert_raises(NotImplementedError, - Tree().fit(X, y, categorical='All').predict, X_sparse) + Tree(categorical='all').fit(X, y).predict, X_sparse) def test_no_sparse_with_categorical(): diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index c10303d395ff6..87a6919067633 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -98,13 +98,15 @@ def preproc_categorical(X, categorical, check_input): """ n_features = np.shape(X)[1] if isinstance(categorical, str): - if categorical == "None": + if categorical == 'none': categorical = np.array([]) - elif categorical == "All": + elif categorical == 'all': categorical = np.arange(n_features) else: - raise ValueError("Invalid value for categorical: %s. Allowed" - " strings are 'All' or 'None'" % categorical) + # Should have been caught in the constructor, but just in case + raise ValueError("Invalid value for categorical: {}. Allowed" + " strings are 'all' or 'none'" + "".format(categorical)) categorical = np.asarray(categorical) if categorical.dtype == np.bool: if categorical.shape != (n_features,): @@ -202,7 +204,8 @@ def __init__(self, max_leaf_nodes, random_state, class_weight=None, - presort=False): + presort=False, + categorical='none'): self.criterion = criterion self.splitter = splitter self.max_depth = max_depth @@ -214,6 +217,7 @@ def __init__(self, self.max_leaf_nodes = max_leaf_nodes self.class_weight = class_weight self.presort = presort + self.categorical = categorical self.n_features_ = None self.n_outputs_ = None @@ -224,8 +228,17 @@ def __init__(self, self.tree_ = None self.max_features_ = None - def fit(self, X, y, sample_weight=None, categorical='None', - check_input=True, X_idx_sorted=None): + # Input validation for parameter categorical + if isinstance(self.categorical, str): + if categorical not in ('all', 'none'): + raise ValueError("Invalid value for categorical: {}. 
Allowed" + " strings are 'all' or 'none'" + "".format(categorical)) + elif len(np.shape(categorical)) != 1: + raise ValueError("Invalid shape for parameter categorical") + + def fit(self, X, y, sample_weight=None, check_input=True, + X_idx_sorted=None): """Build a decision tree from the training set (X, y). Parameters @@ -247,19 +260,6 @@ def fit(self, X, y, sample_weight=None, categorical='None', classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - categorical : array-like or str - Array of feature indices, boolean array of length - n_features, ``'All'``, or ``'None'``. Indicates which - features should be considered as categorical rather than - ordinal. For decision trees, the maximum number of - categories per feature is 64, though the real-world limit - will be much lower because evaluating splits has - :math:`O(2^N)` time complexity, for :math:`N` - categories. Extra-randomized trees do not have this - limitation because they do not try to find the best - split. For these trees, the maximum number of categories - per feature is :math:`2^{31}`. - check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you are doing. @@ -425,7 +425,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', # Do preprocessing of categorical variables X, n_categories, self.category_map_ = preproc_categorical( - X, categorical, check_input) + X, self.categorical, check_input) # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: @@ -765,6 +765,19 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): When using either a smaller dataset or a restricted depth, this may speed up the training. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + Attributes ---------- classes_ : array of shape = [n_classes] or a list of such arrays @@ -836,7 +849,8 @@ def __init__(self, random_state=None, max_leaf_nodes=None, class_weight=None, - presort=False): + presort=False, + categorical='none'): super(DecisionTreeClassifier, self).__init__( criterion=criterion, splitter=splitter, @@ -848,7 +862,8 @@ def __init__(self, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, random_state=random_state, - presort=presort) + presort=presort, + categorical=categorical) def predict_proba(self, X, check_input=True): """Predict class probabilities of the input samples X. @@ -1007,6 +1022,19 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): When using either a smaller dataset or a restricted depth, this may speed up the training. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. 
For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + Attributes ---------- feature_importances_ : array of shape = [n_features] @@ -1069,7 +1097,8 @@ def __init__(self, max_features=None, random_state=None, max_leaf_nodes=None, - presort=False): + presort=False, + categorical='none'): super(DecisionTreeRegressor, self).__init__( criterion=criterion, splitter=splitter, @@ -1080,7 +1109,8 @@ def __init__(self, max_features=max_features, max_leaf_nodes=max_leaf_nodes, random_state=random_state, - presort=presort) + presort=presort, + categorical=categorical) class ExtraTreeClassifier(DecisionTreeClassifier): @@ -1117,7 +1147,8 @@ def __init__(self, max_features="auto", random_state=None, max_leaf_nodes=None, - class_weight=None): + class_weight=None, + categorical='none'): super(ExtraTreeClassifier, self).__init__( criterion=criterion, splitter=splitter, @@ -1128,7 +1159,8 @@ def __init__(self, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, - random_state=random_state) + random_state=random_state, + categorical=categorical) class ExtraTreeRegressor(DecisionTreeRegressor): @@ -1164,7 +1196,8 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", random_state=None, - max_leaf_nodes=None): + max_leaf_nodes=None, + categorical='none'): super(ExtraTreeRegressor, self).__init__( criterion=criterion, splitter=splitter, @@ -1174,4 +1207,5 @@ def __init__(self, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, - random_state=random_state) + random_state=random_state, + categorical=categorical) From 806951228e08afd86962bbf26033d3346d244927 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sat, 23 Jul 2016 13:32:31 -0700 Subject: [PATCH 35/35] Added printf debug statements. 
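The checkpoints print to stderr so they remain visible when the test runner
captures stdout. One caveat: print(..., file=sys.stderr) is function-call
syntax, so on Python 2 the module also needs the print_function future import
unless test_bagging.py already has one (a hedged sketch; the full import block
is not shown in the hunk below):

    # Harmless on Python 3; required on Python 2 for print(..., file=...).
    from __future__ import print_function

    import sys

    print('nocats checkpoint 1', file=sys.stderr)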
--- sklearn/ensemble/tests/test_bagging.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 5d85713a76210..67dde9bdf32e6 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -5,6 +5,8 @@ # Author: Gilles Louppe # License: BSD 3 clause +import sys + import numpy as np from sklearn.base import BaseEstimator @@ -419,40 +421,49 @@ def test_parallel_classification(): X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=rng) - + print('nocats checkpoint 1', file=sys.stderr) ensemble = BaggingClassifier(DecisionTreeClassifier(), n_jobs=3, random_state=0).fit(X_train, y_train) # predict_proba + print('nocats checkpoint 2', file=sys.stderr) ensemble.set_params(n_jobs=1) y1 = ensemble.predict_proba(X_test) + print('nocats checkpoint 3', file=sys.stderr) ensemble.set_params(n_jobs=2) y2 = ensemble.predict_proba(X_test) assert_array_almost_equal(y1, y2) + print('nocats checkpoint 4', file=sys.stderr) ensemble = BaggingClassifier(DecisionTreeClassifier(), n_jobs=1, random_state=0).fit(X_train, y_train) + print('nocats checkpoint 5', file=sys.stderr) y3 = ensemble.predict_proba(X_test) assert_array_almost_equal(y1, y3) # decision_function + print('nocats checkpoint 6', file=sys.stderr) ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'), n_jobs=3, random_state=0).fit(X_train, y_train) + print('nocats checkpoint 7', file=sys.stderr) ensemble.set_params(n_jobs=1) decisions1 = ensemble.decision_function(X_test) + print('nocats checkpoint 8', file=sys.stderr) ensemble.set_params(n_jobs=2) decisions2 = ensemble.decision_function(X_test) assert_array_almost_equal(decisions1, decisions2) + print('nocats checkpoint 9', file=sys.stderr) ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'), n_jobs=1, random_state=0).fit(X_train, y_train) + print('nocats checkpoint 10', file=sys.stderr) decisions3 = ensemble.decision_function(X_test) assert_array_almost_equal(decisions1, decisions3)
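Taken together, these patches leave categorical as a constructor parameter
that is validated up front and propagated to sub-estimators through
estimator_params. A minimal usage sketch against this branch (the toy data and
the choice of column 0 as the categorical feature are illustrative):

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    # Column 0 holds integer-encoded categories; column 1 is ordinal.
    X = np.array([[0., 1.5],
                  [1., 2.0],
                  [2., 0.5],
                  [0., 3.0]])
    y = np.array([0, 1, 1, 0])

    # Before patch 34: clf.fit(X, y, categorical=[0])
    # After patch 34 the flag lives on the estimator, so ensembles can
    # clone it onto every sub-tree:
    clf = DecisionTreeClassifier(categorical=[0], random_state=0)
    clf.fit(X, y)
    print(clf.predict(X))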