From 34355bbc6cd4d63d532ecfc9c1bae033cf482a09 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 12 Apr 2015 21:47:11 -0700 Subject: [PATCH 01/35] Created SplitValue datatype to generalize the concept of a threshold to categorical variables. Replaced the threshold attribute of SplitRecord and Node with SplitValue. --- sklearn/tree/_splitter.pxd | 27 +++++++++++++++++++++-- sklearn/tree/_splitter.pyx | 44 ++++++++++++++++++-------------------- sklearn/tree/_tree.pxd | 5 ++++- sklearn/tree/_tree.pyx | 39 ++++++++++++++++++--------------- 4 files changed, 72 insertions(+), 43 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index e716736e1cf91..8756e94fa5f42 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -19,6 +19,28 @@ ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer +ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer + +cdef union SplitValue: + # Union type to generalize the concept of a threshold to + # categorical features. For non-categorical features, use the + # threshold member. It acts just as before, where feature values + # less than or equal to the threshold go left, and values greater + # than the threshold go right. + # + # For categorical features, use the cat_split member. It works in + # one of two ways, indicated by the value of its least significant + # bit (LSB). If the LSB is 0, then cat_split acts as a bitfield + # for up to 64 categories, sending samples left if the bit + # corresponding to their category is 1 or right if it is 0. If the + # LSB is 1, then the more significant 32 bits of cat_split is a + # random seed and the next 31 bits are the number of deviates to + # draw. To evaluate a sample, draw the required set of categories + # and check if the sample's feature value is in the set. If so, + # send it left; otherwise right. This second method allows up to + # 2**31 category values, but can only be used for RandomSplitter. + DOUBLE_t threshold + UINT64_t cat_split cdef struct SplitRecord: # Data to track sample split @@ -26,7 +48,8 @@ cdef struct SplitRecord: SIZE_t pos # Split samples array at the given position, # i.e. count of samples below threshold for feature. # pos is >= end if the node is a leaf. - double threshold # Threshold to split at. + SplitValue split_value # Generalized threshold for categorical and + # non-categorical features. double improvement # Impurity improvement given parent node. double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. @@ -95,4 +118,4 @@ cdef class Splitter: cdef void node_value(self, double* dest) nogil - cdef double node_impurity(self) nogil \ No newline at end of file + cdef double node_impurity(self) nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 4c59b6960e7a0..4c20834259388 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -48,7 +48,7 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil: self.impurity_right = INFINITY self.pos = start_pos self.feature = 0 - self.threshold = 0. + self.split_value.threshold = 0. 
self.improvement = -INFINITY cdef class Splitter: @@ -461,10 +461,10 @@ cdef class BestSplitter(BaseDenseSplitter): if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement - current.threshold = (Xf[p - 1] + Xf[p]) / 2.0 + current.split_value.threshold = (Xf[p - 1] + Xf[p]) / 2.0 - if current.threshold == Xf[p]: - current.threshold = Xf[p - 1] + if current.split_value.threshold == Xf[p]: + current.split_value.threshold = Xf[p - 1] best = current # copy @@ -475,7 +475,7 @@ cdef class BestSplitter(BaseDenseSplitter): p = start while p < partition_end: - if X[X_sample_stride * samples[p] + feature_offset] <= best.threshold: + if X[X_sample_stride * samples[p] + feature_offset] <= best.split_value.threshold: p += 1 else: @@ -749,19 +749,18 @@ cdef class RandomSplitter(BaseDenseSplitter): features[f_i], features[f_j] = features[f_j], features[f_i] # Draw a random threshold - current.threshold = rand_uniform(min_feature_value, - max_feature_value, - random_state) + current.split_value.threshold = rand_uniform( + min_feature_value, max_feature_value, random_state) - if current.threshold == max_feature_value: - current.threshold = min_feature_value + if current.split_value.threshold == max_feature_value: + current.split_value.threshold = min_feature_value # Partition partition_end = end p = start while p < partition_end: current_feature_value = Xf[p] - if current_feature_value <= current.threshold: + if current_feature_value <= current.split_value.threshold: p += 1 else: partition_end -= 1 @@ -803,7 +802,7 @@ cdef class RandomSplitter(BaseDenseSplitter): p = start while p < partition_end: - if X[X_sample_stride * samples[p] + feature_stride] <= best.threshold: + if X[X_sample_stride * samples[p] + feature_stride] <= best.split_value.threshold: p += 1 else: @@ -1347,9 +1346,9 @@ cdef class BestSparseSplitter(BaseSparseSplitter): if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement - current.threshold = (Xf[p_prev] + Xf[p]) / 2.0 - if current.threshold == Xf[p]: - current.threshold = Xf[p_prev] + current.split_value.threshold = (Xf[p_prev] + Xf[p]) / 2.0 + if current.split_value.threshold == Xf[p]: + current.split_value.threshold = Xf[p_prev] best = current @@ -1358,7 +1357,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): self.extract_nnz(best.feature, &end_negative, &start_positive, &is_samples_sorted) - self._partition(best.threshold, end_negative, start_positive, + self._partition(best.split_value.threshold, end_negative, start_positive, best.pos) self.criterion.reset() @@ -1542,15 +1541,14 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): features[f_i], features[f_j] = features[f_j], features[f_i] # Draw a random threshold - current.threshold = rand_uniform(min_feature_value, - max_feature_value, - random_state) + current.split_value.threshold = rand_uniform( + min_feature_value, max_feature_value, random_state) - if current.threshold == max_feature_value: - current.threshold = min_feature_value + if current.split_value.threshold == max_feature_value: + current.split_value.threshold = min_feature_value # Partition - current.pos = self._partition(current.threshold, + current.pos = self._partition(current.split_value.threshold, end_negative, start_positive, start_positive + @@ -1586,7 +1584,7 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): self.extract_nnz(best.feature, &end_negative, &start_positive, &is_samples_sorted) - self._partition(best.threshold, end_negative, 
start_positive, + self._partition(best.split_value.threshold, end_negative, start_positive, best.pos) self.criterion.reset() diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 6b34e51a07c8e..59bd414f0423b 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -17,9 +17,11 @@ ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer +ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer from ._splitter cimport Splitter from ._splitter cimport SplitRecord +from ._splitter cimport SplitValue cdef struct Node: # Base storage structure for the nodes in a Tree object @@ -27,7 +29,8 @@ SIZE_t left_child # id of the left child of the node SIZE_t right_child # id of the right child of the node SIZE_t feature # Feature used for splitting the node - DOUBLE_t threshold # Threshold value at the node + SplitValue split_value # Generalized threshold for categorical and + # non-categorical features. DOUBLE_t impurity # Impurity of the node (i.e., the value of the criterion) SIZE_t n_node_samples # Number of samples at the node DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8bc02ef44d94d..58df97698e71d 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -66,16 +66,21 @@ cdef SIZE_t INITIAL_STACK_SIZE = 10 cdef DTYPE_t MIN_IMPURITY_SPLIT = 1e-7 # Repeat struct definition for numpy +SPLITVALUE_DTYPE = np.dtype({ + 'names': ['threshold', 'cat_split'], + 'formats': [np.float64, np.uint64], + 'offsets': [0, 0] +}) NODE_DTYPE = np.dtype({ - 'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', - 'n_node_samples', 'weighted_n_node_samples'], - 'formats': [np.intp, np.intp, np.intp, np.float64, np.float64, np.intp, - np.float64], + 'names': ['left_child', 'right_child', 'feature', 'split_value', + 'impurity', 'n_node_samples', 'weighted_n_node_samples'], + 'formats': [np.intp, np.intp, np.intp, SPLITVALUE_DTYPE, np.float64, + np.intp, np.float64], 'offsets': [ <Py_ssize_t> &(<Node*> NULL).left_child, <Py_ssize_t> &(<Node*> NULL).right_child, <Py_ssize_t> &(<Node*> NULL).feature, - <Py_ssize_t> &(<Node*> NULL).threshold, + <Py_ssize_t> &(<Node*> NULL).split_value, <Py_ssize_t> &(<Node*> NULL).impurity, <Py_ssize_t> &(<Node*> NULL).n_node_samples, <Py_ssize_t> &(<Node*> NULL).weighted_n_node_samples @@ -229,8 +234,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or (split.pos >= end) node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, - weighted_n_node_samples) + split.split_value.threshold, impurity, + n_node_samples, weighted_n_node_samples) if node_id == <SIZE_t>(-1): rc = -1 @@ -356,7 +361,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.left_child = _TREE_LEAF node.right_child = _TREE_LEAF node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED + node.split_value.threshold = _TREE_UNDEFINED else: # Node is expandable @@ -444,8 +449,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, - weighted_n_node_samples) + split.feature, split.split_value.threshold, + impurity, n_node_samples, weighted_n_node_samples) if node_id == <SIZE_t>(-1): return -1 @@ -558,7 +563,7 @@ cdef class Tree: property threshold: def __get__(self): - return self._get_node_ndarray()['threshold'][:self.node_count] + return
self._get_node_ndarray()['split_value']['threshold'][:self.node_count] property impurity: def __get__(self): @@ -725,12 +730,12 @@ cdef class Tree: node.left_child = _TREE_LEAF node.right_child = _TREE_LEAF node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED + node.split_value.threshold = _TREE_UNDEFINED else: # left_child and right_child will be set later node.feature = feature - node.threshold = threshold + node.split_value.threshold = threshold self.node_count += 1 @@ -784,7 +789,7 @@ cdef class Tree: while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: if X_ptr[X_sample_stride * i + - X_fx_stride * node.feature] <= node.threshold: + X_fx_stride * node.feature] <= node.split_value.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -855,7 +860,7 @@ cdef class Tree: else: feature_value = 0. - if feature_value <= node.threshold: + if feature_value <= node.split_value.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -918,7 +923,7 @@ cdef class Tree: indptr_ptr[i + 1] += 1 if X_ptr[X_sample_stride * i + - X_fx_stride * node.feature] <= node.threshold: + X_fx_stride * node.feature] <= node.split_value.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1006,7 +1011,7 @@ cdef class Tree: else: feature_value = 0. - if feature_value <= node.threshold: + if feature_value <= node.split_value.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] From a9db0ea636ff0eb25a8a78eeb5795164bd593124 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 12 Apr 2015 21:49:30 -0700 Subject: [PATCH 02/35] Added attribute n_categories to Splitter and Tree, an array of ints that defaults to -1 for each feature (indicating non-categorical feature). --- sklearn/tree/_splitter.pxd | 2 ++ sklearn/tree/_splitter.pyx | 8 ++++++++ sklearn/tree/_tree.pxd | 3 +++ sklearn/tree/_tree.pyx | 28 +++++++++++++++++++++++++++- sklearn/tree/_utils.pxd | 1 + 5 files changed, 41 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 8756e94fa5f42..eb296d597b388 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -86,6 +86,8 @@ cdef class Splitter: cdef DOUBLE_t* y cdef SIZE_t y_stride cdef DOUBLE_t* sample_weight + cdef INT32_t* n_categories # (n_features) array giving number of + # categories (<0 for non-categorical) # The samples vector `samples` is maintained by the Splitter object such # that the samples contained in a node are contiguous. 
With this setting, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 4c20834259388..d111a0fd3a2f2 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -95,6 +95,7 @@ cdef class Splitter: self.y = NULL self.y_stride = 0 self.sample_weight = NULL + self.n_categories = NULL self.max_features = max_features self.min_samples_leaf = min_samples_leaf @@ -109,6 +110,7 @@ cdef class Splitter: free(self.features) free(self.constant_features) free(self.feature_values) + free(self.n_categories) def __getstate__(self): return {} @@ -181,6 +183,12 @@ cdef class Splitter: self.sample_weight = sample_weight + # Initialize the number of categories for each feature + # A value of -1 indicates a non-categorical feature + safe_realloc(&self.n_categories, n_features) + for i in range(n_features): + self.n_categories[i] = -1 + cdef void node_reset(self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples) nogil: """Reset splitter on node samples[start:end]. diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 59bd414f0423b..732a18597d590 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -55,6 +55,8 @@ cdef class Tree: cdef Node* nodes # Array of nodes cdef double* value # (capacity, n_outputs, max_n_classes) array of values cdef SIZE_t value_stride # = n_outputs * max_n_classes + cdef INT32_t* n_categories # (n_features) array giving number of + # categories (<0 for non-categorical) # Methods cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, @@ -66,6 +68,7 @@ cdef class Tree: cdef np.ndarray _get_value_ndarray(self) cdef np.ndarray _get_node_ndarray(self) + cdef np.ndarray _get_ncat_ndarray(self) cpdef np.ndarray predict(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 58df97698e71d..e56e628d804dd 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -603,6 +603,10 @@ cdef class Tree: self.capacity = 0 self.value = NULL self.nodes = NULL + self.n_categories = NULL + safe_realloc(&self.n_categories, n_features) + for k in range(n_features): + self.n_categories[k] = -1 def __dealloc__(self): """Destructor.""" @@ -610,6 +614,7 @@ cdef class Tree: free(self.n_classes) free(self.value) free(self.nodes) + free(self.n_categories) def __reduce__(self): """Reduce re-implementation, for pickling.""" @@ -625,6 +630,7 @@ cdef class Tree: d["node_count"] = self.node_count d["nodes"] = self._get_node_ndarray() d["values"] = self._get_value_ndarray() + d["n_categories"] = self._get_ncat_ndarray() return d def __setstate__(self, d): @@ -638,6 +644,7 @@ cdef class Tree: node_ndarray = d['nodes'] value_ndarray = d['values'] + ncat_ndarray = d['n_categories'] value_shape = (node_ndarray.shape[0], self.n_outputs, self.max_n_classes) @@ -646,7 +653,10 @@ cdef class Tree: not node_ndarray.flags.c_contiguous or value_ndarray.shape != value_shape or not value_ndarray.flags.c_contiguous or - value_ndarray.dtype != np.float64): + value_ndarray.dtype != np.float64 or + ncat_ndarray.shape != (self.n_features,) or + ncat_ndarray.dtype != np.int32 or + not ncat_ndarray.flags.c_contiguous): raise ValueError('Did not recognise loaded array layout') self.capacity = node_ndarray.shape[0] @@ -656,6 +666,8 @@ cdef class Tree: self.capacity * sizeof(Node)) value = memcpy(self.value, ( value_ndarray).data, self.capacity * self.value_stride * sizeof(double)) + ncat = memcpy(self.n_categories, ( ncat_ndarray).data, + self.n_features * sizeof(INT32_t)) cdef void _resize(self, SIZE_t capacity) except 
*: """Resize all inner arrays to `capacity`, if `capacity` == -1, then @@ -1087,6 +1099,20 @@ cdef class Tree: arr.base = self return arr + cdef np.ndarray _get_ncat_ndarray(self): + """Wraps n_categories as a 3-d Numpy array + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef np.npy_intp shape[1] + shape[0] = self.n_features + cdef np.ndarray arr + arr = np.PyArray_SimpleNewFromData(1, shape, np.NPY_INT32, self.n_categories) + Py_INCREF(self) + arr.base = self + return arr + cdef np.ndarray _get_node_ndarray(self): """Wraps nodes as a NumPy struct array. diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 8f659aa86374e..cdb6ac1f54790 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -33,6 +33,7 @@ ctypedef fused realloc_ptr: (DTYPE_t*) (SIZE_t*) (unsigned char*) + (INT32_t*) cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * From 53457e5812703adfcd8758470344dc6f5a4002be Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 12 Apr 2015 22:39:11 -0700 Subject: [PATCH 03/35] Added a goes_left function for evaluating splits. --- sklearn/tree/_utils.pxd | 6 ++++++ sklearn/tree/_utils.pyx | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index cdb6ac1f54790..5b73df0ebb27e 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -10,6 +10,8 @@ import numpy as np cimport numpy as np +from ._splitter cimport SplitValue + ctypedef np.npy_float32 DTYPE_t # Type of X ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters @@ -51,6 +53,10 @@ cdef double rand_uniform(double low, double high, cdef double log(double x) nogil +# Function for traversing a tree +cdef bint goes_left(DTYPE_t feature_value, SplitValue split, + INT32_t n_categories) nogil + # ============================================================================= # Stack data structure # ============================================================================= diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 70232e3851050..a2c975506a751 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -82,6 +82,26 @@ cdef inline double rand_uniform(double low, double high, cdef inline double log(double x) nogil: return ln(x) / ln(2.0) +cdef bint goes_left(DTYPE_t feature_value, SplitValue split, + INT32_t n_categories) nogil: + """Determine whether a sample goes to the left or right child node.""" + cdef UINT32_t rng_seed + + if n_categories < 1: + # Non-categorical feature + return feature_value <= split.threshold + elif (split.cat_split & 1 == 0): + # Bitfield model + return (split.cat_split >> feature_value) & 1 + else: + # Random model + rng_seed = split.cat_split >> 32 + for q in range((split.cat_split & 0xFFFFFFFF) >> 1): + if (feature_value == + rand_int(0, n_categories, &rng_seed)): + return 1 + return 0 + # ============================================================================= # Stack data structure From ba93458cad0a55ee08261bd891bae26c3d21f213 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 12 Apr 2015 22:46:49 -0700 Subject: [PATCH 04/35] Tree.apply() now uses the goes_left function. 
--- sklearn/tree/_splitter.pxd | 2 +- sklearn/tree/_tree.pxd | 4 ++-- sklearn/tree/_tree.pyx | 17 +++++++++-------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index eb296d597b388..c75714af1c1af 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -21,7 +21,7 @@ ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer -cdef union SplitValue: +ctypedef union SplitValue: # Union type to generalize the concept of a threshold to # categorical features. For non-categorical features, use the # threshold member. It acts just as before, where feature values diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 732a18597d590..dd22775484e7b 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -60,8 +60,8 @@ cdef class Tree: # Methods cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, + SIZE_t feature, SplitValue split_value, + double impurity, SIZE_t n_node_samples, double weighted_n_samples) nogil cdef void _resize(self, SIZE_t capacity) except * cdef int _resize_c(self, SIZE_t capacity=*) nogil diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index e56e628d804dd..9d557de88a857 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -35,6 +35,7 @@ from ._utils cimport StackRecord from ._utils cimport PriorityHeap from ._utils cimport PriorityHeapRecord from ._utils cimport safe_realloc +from ._utils cimport goes_left from ._utils cimport sizet_ptr_to_ndarray cdef extern from "numpy/arrayobject.h": @@ -186,7 +187,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SplitRecord split cdef SIZE_t node_id - cdef double threshold cdef double impurity = INFINITY cdef SIZE_t n_constant_features cdef bint is_leaf @@ -234,7 +234,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or (split.pos >= end) node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.split_value.threshold, impurity, + split.split_value, impurity, n_node_samples, weighted_n_node_samples) if node_id == (-1): @@ -449,7 +449,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.split_value.threshold, + split.feature, split.split_value, impurity, n_node_samples, weighted_n_node_samples) if node_id == (-1): return -1 @@ -713,7 +713,7 @@ cdef class Tree: return 0 cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, + SIZE_t feature, SplitValue split_value, double impurity, SIZE_t n_node_samples, double weighted_n_node_samples) nogil: """Add a node to the tree. @@ -747,7 +747,7 @@ cdef class Tree: else: # left_child and right_child will be set later node.feature = feature - node.split_value.threshold = threshold + node.split_value = split_value self.node_count += 1 @@ -800,8 +800,8 @@ cdef class Tree: # While node not a leaf while node.left_child != _TREE_LEAF: # ... 
and node.right_child != _TREE_LEAF: - if X_ptr[X_sample_stride * i + - X_fx_stride * node.feature] <= node.split_value.threshold: + if goes_left(X_ptr[X_sample_stride * i + X_fx_stride * node.feature], + node.split_value, self.n_categories[node.feature]): node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -872,7 +872,8 @@ cdef class Tree: else: feature_value = 0. - if feature_value <= node.split_value.threshold: + if goes_left(feature_value, node.split_value, + self.n_categories[node.feature]): node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] From d7d13b39f0c2061aa10aa2e958ea58d107e349b6 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Fri, 17 Apr 2015 14:56:58 -0700 Subject: [PATCH 05/35] Added categorical split code to BestSplitter.node_split --- sklearn/tree/_splitter.pyx | 93 +++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 32 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index d111a0fd3a2f2..f5caf3339f121 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -33,6 +33,7 @@ from ._utils cimport rand_int from ._utils cimport rand_uniform from ._utils cimport RAND_R_MAX from ._utils cimport safe_realloc +from ._utils cimport goes_left cdef double INFINITY = np.inf @@ -333,8 +334,8 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SIZE_t f_i = n_features cdef SIZE_t f_j - cdef SIZE_t tmp cdef SIZE_t p + cdef SIZE_t q cdef SIZE_t feature_idx_offset cdef SIZE_t feature_offset cdef SIZE_t i @@ -350,6 +351,7 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SIZE_t n_total_constants = n_known_constants cdef DTYPE_t current_feature_value cdef SIZE_t partition_end + cdef bint is_categorical _init_split(&best, end) @@ -391,9 +393,8 @@ cdef class BestSplitter(BaseDenseSplitter): if f_j < n_known_constants: # f_j in the interval [n_drawn_constants, n_known_constants[ - tmp = features[f_j] - features[f_j] = features[n_drawn_constants] - features[n_drawn_constants] = tmp + features[f_j], features[n_drawn_constants] = ( + features[n_drawn_constants], features[f_j]) n_drawn_constants += 1 @@ -437,44 +438,72 @@ cdef class BestSplitter(BaseDenseSplitter): # Evaluate all splits self.criterion.reset() - p = start + is_categorical = self.n_categories[current.feature] > 0 + p = 0 if is_categorical else start + + while True: + if is_categorical: + # WARNING: This is O(n_samples * + # 2**n_categories), and will be very slow + # for more than just a few categories. 
+ if p > (1 << self.n_categories[current.feature]) - 1: + break + else: + p += 2 # LSB must always be 0 + + # Partition + q = start + partition_end = end + while q < partition_end: + if ((p >> Xf[q]) & 1): + q += 1 + else: + partition_end -= 1 + Xf[q], Xf[partition_end] = Xf[partition_end], Xf[q] + samples[q], samples[partition_end] = ( + samples[partition_end], samples[q]) + current.pos = q + else: + # Non-categorical feature + while (p + 1 < end and + Xf[p + 1] <= Xf[p] + FEATURE_THRESHOLD): + p += 1 - while p < end: - while (p + 1 < end and - Xf[p + 1] <= Xf[p] + FEATURE_THRESHOLD): + # (p + 1 >= end) or (X[samples[p + 1], current.feature] > + # X[samples[p], current.feature]) p += 1 + # (p >= end) or (X[samples[p], current.feature] > + # X[samples[p - 1], current.feature]) - # (p + 1 >= end) or (X[samples[p + 1], current.feature] > - # X[samples[p], current.feature]) - p += 1 - # (p >= end) or (X[samples[p], current.feature] > - # X[samples[p - 1], current.feature]) + if p >= end: + break - if p < end: current.pos = p - # Reject if min_samples_leaf is not guaranteed - if (((current.pos - start) < min_samples_leaf) or - ((end - current.pos) < min_samples_leaf)): - continue + # Reject if min_samples_leaf is not guaranteed + if (((current.pos - start) < min_samples_leaf) or + ((end - current.pos) < min_samples_leaf)): + continue - self.criterion.update(current.pos) + self.criterion.update(current.pos) - # Reject if min_weight_leaf is not satisfied - if ((self.criterion.weighted_n_left < min_weight_leaf) or - (self.criterion.weighted_n_right < min_weight_leaf)): - continue + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + continue - current_proxy_improvement = self.criterion.proxy_impurity_improvement() + current_proxy_improvement = self.criterion.proxy_impurity_improvement() - if current_proxy_improvement > best_proxy_improvement: - best_proxy_improvement = current_proxy_improvement + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + if is_categorical: + current.split_value.cat_split = p + else: current.split_value.threshold = (Xf[p - 1] + Xf[p]) / 2.0 - if current.split_value.threshold == Xf[p]: current.split_value.threshold = Xf[p - 1] - best = current # copy + best = current # copy # Reorganize into samples[start:best.pos] + samples[best.pos:end] if best.pos < end: @@ -483,15 +512,15 @@ cdef class BestSplitter(BaseDenseSplitter): p = start while p < partition_end: - if X[X_sample_stride * samples[p] + feature_offset] <= best.split_value.threshold: + if goes_left(X[X_sample_stride * samples[p] + feature_offset], + best.split_value, self.n_categories[best.feature]): p += 1 else: partition_end -= 1 - tmp = samples[partition_end] - samples[partition_end] = samples[p] - samples[p] = tmp + samples[p], samples[partition_end] = ( + samples[partition_end], samples[p]) self.criterion.reset() self.criterion.update(best.pos) From ec37e11bda066fc13e49c7006b05a93c23599361 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Fri, 17 Apr 2015 15:01:42 -0700 Subject: [PATCH 06/35] Added categorical split code to RandomSplitter.node_split --- sklearn/tree/_splitter.pyx | 47 +++++++++++++++++++++++--------------- sklearn/tree/_utils.pxd | 3 +++ 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index f5caf3339f121..5171a7cf76a7e 100644 --- 
a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -29,6 +29,7 @@ np.import_array() from scipy.sparse import csc_matrix from ._utils cimport log +from ._utils cimport our_rand_r from ._utils cimport rand_int from ._utils cimport rand_uniform from ._utils cimport RAND_R_MAX @@ -696,7 +697,6 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef SIZE_t f_i = n_features cdef SIZE_t f_j cdef SIZE_t p - cdef SIZE_t tmp cdef SIZE_t feature_stride # Number of features discovered to be constant during the split search cdef SIZE_t n_found_constants = 0 @@ -710,6 +710,9 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef DTYPE_t max_feature_value cdef DTYPE_t current_feature_value cdef SIZE_t partition_end + cdef bint is_categorical + cdef UINT32_t split_n_draw + cdef UINT64_t split_seed _init_split(&best, end) @@ -746,9 +749,8 @@ cdef class RandomSplitter(BaseDenseSplitter): if f_j < n_known_constants: # f_j in the interval [n_drawn_constants, n_known_constants[ - tmp = features[f_j] - features[f_j] = features[n_drawn_constants] - features[n_drawn_constants] = tmp + features[f_j], features[n_drawn_constants] = ( + features[n_drawn_constants], features[f_j]) n_drawn_constants += 1 @@ -785,19 +787,29 @@ cdef class RandomSplitter(BaseDenseSplitter): f_i -= 1 features[f_i], features[f_j] = features[f_j], features[f_i] - # Draw a random threshold - current.split_value.threshold = rand_uniform( - min_feature_value, max_feature_value, random_state) - - if current.split_value.threshold == max_feature_value: - current.split_value.threshold = min_feature_value + # Construct a random split + is_categorical = self.n_categories[current.feature] > 0 + if is_categorical: + # split_n_draw is the number of categories to send left + # TODO: this should be a binomial draw + split_n_draw = rand_int( + 1, self.n_categories[current.feature], random_state) + split_seed = our_rand_r(random_state) + current.split_value.cat_split = ( + (split_seed << 32) | (split_n_draw << 1) | 1) + else: + current.split_value.threshold = rand_uniform( + min_feature_value, max_feature_value, random_state) + if current.split_value.threshold == max_feature_value: + current.split_value.threshold = min_feature_value # Partition partition_end = end p = start while p < partition_end: current_feature_value = Xf[p] - if current_feature_value <= current.split_value.threshold: + if goes_left(current_feature_value, current.split_value, + self.n_categories[current.feature]): p += 1 else: partition_end -= 1 @@ -805,9 +817,8 @@ cdef class RandomSplitter(BaseDenseSplitter): Xf[p] = Xf[partition_end] Xf[partition_end] = current_feature_value - tmp = samples[partition_end] - samples[partition_end] = samples[p] - samples[p] = tmp + samples[p], samples[partition_end] = ( + samples[partition_end], samples[p]) current.pos = partition_end @@ -839,15 +850,15 @@ cdef class RandomSplitter(BaseDenseSplitter): p = start while p < partition_end: - if X[X_sample_stride * samples[p] + feature_stride] <= best.split_value.threshold: + if goes_left(X[X_sample_stride * samples[p] + feature_stride], + best.split_value, self.n_categories[best.feature]): p += 1 else: partition_end -= 1 - tmp = samples[partition_end] - samples[partition_end] = samples[p] - samples[p] = tmp + samples[p], samples[partition_end] = ( + samples[partition_end], samples[p]) self.criterion.reset() diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 5b73df0ebb27e..856dfaf87aaf2 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -43,6 +43,9 @@ cdef 
realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) +cdef UINT32_t our_rand_r(UINT32_t* seed) nogil + + cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil From 94521965b2eb2fecac30d647f56555c0c1e0f5b0 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 11 May 2015 14:47:40 -0700 Subject: [PATCH 07/35] Added a 'categorical' parameter to BaseDecisionTree.fit(). It currently does nothing. --- sklearn/tree/tree.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index d33f2fbadcb80..cf14ae9ff2a37 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -110,8 +110,8 @@ def __init__(self, self.tree_ = None self.max_features_ = None - def fit(self, X, y, sample_weight=None, check_input=True, - X_idx_sorted=None): + def fit(self, X, y, sample_weight=None, categorical='None', + check_input=True, X_idx_sorted=None): """Build a decision tree from the training set (X, y). Parameters ---------- @@ -133,9 +133,22 @@ def fit(self, X, y, sample_weight=None, check_input=True, classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'All'``, or ``'None'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + check_input : boolean, (default=True) Allow to bypass several input checking. - Don't use this parameter unless you know what you do. + Don't use this parameter unless you know what you are doing. X_idx_sorted : array-like, shape = [n_samples, n_features], optional The indexes of the sorted training input samples. If many tree From 361f87cc79b6e508e5a020406dc5927222895c33 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 11 May 2015 14:56:30 -0700 Subject: [PATCH 08/35] Added a property getter/setter for Tree.n_categories --- sklearn/tree/_tree.pyx | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 9d557de88a857..138833cd99894 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -529,6 +529,10 @@ cdef class Tree: value : array of double, shape [node_count, n_outputs, max_n_classes] Contains the constant prediction value of each node. + n_categories : array of int, shape [n_features] + Number of expected category values for categorical features, or + -1 for non-categorical features. + impurity : array of double, shape [node_count] impurity[i] holds the impurity (i.e., the value of the splitting criterion) at node i.
@@ -581,6 +585,15 @@ cdef class Tree: def __get__(self): return self._get_value_ndarray()[:self.node_count] + property n_categories: + def __get__(self): + return self._get_ncat_ndarray()[:self.n_features] + + def __set__(self, np.ndarray[INT32_t, ndim=1] value): + cdef SIZE_t i + for i in range(self.n_features): + self.n_categories[i] = value[i] + def __cinit__(self, int n_features, np.ndarray[SIZE_t, ndim=1] n_classes, int n_outputs): """Constructor.""" From fc7efa64f92e922368b46a8425c5cb17f21d4704 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 11 May 2015 15:31:21 -0700 Subject: [PATCH 09/35] Added n_categories arguments to Splitter.init() and TreeBuilder.build(). Enforced max n_categories=2**31 for RandomSplitter. --- sklearn/tree/_splitter.pxd | 1 + sklearn/tree/_splitter.pyx | 14 +++++++++----- sklearn/tree/_tree.pxd | 1 + sklearn/tree/_tree.pyx | 23 +++++++++++++++++++++-- 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index c75714af1c1af..809feb2ac32fe 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -108,6 +108,7 @@ cdef class Splitter: # Methods cdef void init(self, object X, np.ndarray y, DOUBLE_t* sample_weight, + INT32_t* n_categories, np.ndarray X_idx_sorted=*) except * cdef void node_reset(self, SIZE_t start, SIZE_t end, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 5171a7cf76a7e..46cb3be42898a 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -124,6 +124,7 @@ cdef class Splitter: object X, np.ndarray[DOUBLE_t, ndim=2, mode="c"] y, DOUBLE_t* sample_weight, + INT32_t* n_categories, np.ndarray X_idx_sorted=None) except *: """Initialize the splitter. @@ -189,7 +190,8 @@ cdef class Splitter: # A value of -1 indicates a non-categorical feature safe_realloc(&self.n_categories, n_features) for i in range(n_features): - self.n_categories[i] = -1 + self.n_categories[i] = (-1 if n_categories == NULL + else n_categories[i]) cdef void node_reset(self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples) nogil: @@ -271,11 +273,12 @@ cdef class BaseDenseSplitter(Splitter): object X, np.ndarray[DOUBLE_t, ndim=2, mode="c"] y, DOUBLE_t* sample_weight, + INT32_t* n_categories, np.ndarray X_idx_sorted=None) except *: """Initialize the splitter.""" # Call parent init - Splitter.init(self, X, y, sample_weight) + Splitter.init(self, X, y, sample_weight, n_categories) # Initialize X cdef np.ndarray X_ndarray = X @@ -792,8 +795,8 @@ cdef class RandomSplitter(BaseDenseSplitter): if is_categorical: # split_n_draw is the number of categories to send left # TODO: this should be a binomial draw - split_n_draw = rand_int( - 1, self.n_categories[current.feature], random_state) + split_n_draw = rand_int(1, self.n_categories[current.feature], + random_state) & 0x7FFFFFFF split_seed = our_rand_r(random_state) current.split_value.cat_split = ( (split_seed << 32) | (split_n_draw << 1) | 1) @@ -916,11 +919,12 @@ cdef class BaseSparseSplitter(Splitter): object X, np.ndarray[DOUBLE_t, ndim=2, mode="c"] y, DOUBLE_t* sample_weight, + INT32_t* n_categories, np.ndarray X_idx_sorted=None) except *: """Initialize the splitter.""" # Call parent init - Splitter.init(self, X, y, sample_weight) + Splitter.init(self, X, y, sample_weight, n_categories) if not isinstance(X, csc_matrix): raise ValueError("X should be in csc format") diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index dd22775484e7b..e877f66cf6d0b 100644 --- 
a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -104,5 +104,6 @@ cdef class TreeBuilder: cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=*, + np.ndarray n_categories=*, np.ndarray X_idx_sorted=*) cdef _check_input(self, object X, np.ndarray y, np.ndarray sample_weight) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 138833cd99894..f88618a0762b5 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -97,6 +97,7 @@ cdef class TreeBuilder: cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, + np.ndarray n_categories=None, np.ndarray X_idx_sorted=None): """Build a decision tree from the training set (X, y).""" pass @@ -146,6 +147,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, + np.ndarray n_categories=None, np.ndarray X_idx_sorted=None): """Build a decision tree from the training set (X, y).""" @@ -156,6 +158,14 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if sample_weight is not None: sample_weight_ptr = sample_weight.data + cdef INT32_t *n_categories_ptr = NULL + if n_categories is not None: + if ((n_categories.dtype != np.int32) or + (not n_categories.flags.contiguous)): + n_categories = np.asarray(n_categories, + dtype=np.int32, order="C") + n_categories_ptr = n_categories.data + # Initial capacity cdef int init_capacity @@ -174,7 +184,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t min_samples_split = self.min_samples_split # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight_ptr, X_idx_sorted) + splitter.init(X, y, sample_weight_ptr, n_categories_ptr, X_idx_sorted) cdef SIZE_t start cdef SIZE_t end @@ -303,6 +313,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, + np.ndarray n_categories=None, np.ndarray X_idx_sorted=None): """Build a decision tree from the training set (X, y).""" @@ -313,6 +324,14 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if sample_weight is not None: sample_weight_ptr = sample_weight.data + cdef INT32_t *n_categories_ptr = NULL + if n_categories is not None: + if ((n_categories.dtype != np.int32) or + (not n_categories.flags.contiguous)): + n_categories = np.asarray(n_categories, + dtype=np.int32, order="C") + n_categories_ptr = n_categories.data + # Parameters cdef Splitter splitter = self.splitter cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes @@ -321,7 +340,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef SIZE_t min_samples_split = self.min_samples_split # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight_ptr, X_idx_sorted) + splitter.init(X, y, sample_weight_ptr, n_categories_ptr, X_idx_sorted) cdef PriorityHeap frontier = PriorityHeap(INITIAL_STACK_SIZE) cdef PriorityHeapRecord record From 144d665f910fa30d7d78e1fea6db7c99e7dceedb Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 24 Jun 2015 18:49:56 -0700 Subject: [PATCH 10/35] Added python interface for categorical features. 
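From the caller's side, the interface added below marks features as categorical at fit time rather than in the estimator constructor. A usage sketch with this patch series applied (the data here is illustrative):

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    # Column 0 holds integer category codes; column 1 is ordinal.
    X = np.array([[0., 1.2], [1., 0.7], [2., 3.1], [1., 2.8]])
    y = np.array([0, 0, 1, 1])

    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y, categorical=[0])              # feature indices
    clf.fit(X, y, categorical=[True, False])    # boolean mask
    clf.fit(X, y, categorical='All')            # every feature categorical

Categorical columns are rounded to integers and remapped to a contiguous range through category_map_; at predict time, categories unseen during training are assigned fresh codes starting at n_categories.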
--- sklearn/tree/tree.py | 55 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index cf14ae9ff2a37..80a19d573f8bd 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -20,6 +20,7 @@ from abc import ABCMeta from abc import abstractmethod from math import ceil +from itertools import count import numpy as np from scipy.sparse import issparse @@ -106,6 +107,7 @@ def __init__(self, self.n_outputs_ = None self.classes_ = None self.n_classes_ = None + self.category_map_ = None self.tree_ = None self.max_features_ = None @@ -309,6 +311,42 @@ def fit(self, X, y, sample_weight=None, categorical='None', else: sample_weight = expanded_class_weight + if isinstance(categorical, str): + if categorical == "None": + categorical = np.array([]) + elif categorical == "All": + categorical = np.arange(self.n_features_) + else: + raise ValueError("Invalid value for categorical: %s. Allowed" + " strings are 'All' or 'None'" % categorical) + categorical = np.atleast_1d(categorical).flatten() + if categorical.dtype == np.bool: + if categorical.size != self.n_features_: + raise ValueError("Shape of boolean parameter categorical must" + " be [n_features]") + categorical = np.nonzero(categorical)[0] + if (categorical.size > self.n_features_ or + (categorical.size > 0 and + (np.min(categorical) < 0 or + np.max(categorical) >= self.n_features_))): + raise ValueError("Invalid shape or invalid feature index for" + " parameter categorical") + if issparse(X) and len(categorical) > 0: + raise NotImplementedError("Categorical features not supported with" + " sparse inputs") + + # Determine the number of categories in each categorical feature + n_categories = np.zeros(self.n_features_, dtype=np.int32) - 1 + self.category_map_ = [None] * self.n_features_ + if categorical.size > 0: + X = np.copy(X) + for feature in categorical: + rounded = np.round(X[:, feature]).astype(np.int64) + self.category_map_[feature] = dict(zip(set(rounded), count())) + X[:, feature] = np.array([self.category_map_[feature][x] + for x in rounded]).astype(DTYPE) + n_categories[feature] = len(self.category_map_[feature]) + # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0.
and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * @@ -364,6 +402,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', self.presort) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) + self.tree_.n_categories = n_categories # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: @@ -378,7 +417,8 @@ def fit(self, X, y, sample_weight=None, categorical='None', max_depth, max_leaf_nodes) - builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) + builder.build(self.tree_, X, y, sample_weight, n_categories, + X_idx_sorted) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] @@ -406,6 +446,19 @@ def _validate_X_predict(self, X, check_input): "input n_features is %s " % (self.n_features_, n_features)) + # Map categorical features onto integers + n_categories = self.tree_.n_categories + categorical_features = np.nonzero(n_categories > 0)[0] + if categorical_features.size > 0: + X = np.copy(X) + for feature in categorical_features: + rounded = np.round(X[:, feature]).astype('int64') + new_cat = set(rounded) - set(self.category_map_[feature]) + new_cat_map = dict(zip(new_cat, count(n_categories[feature]))) + X[:, feature] = np.array( + [self.category_map_[feature].get(x, new_cat_map.get(x)) + for x in rounded]).astype(DTYPE) + return X def predict(self, X, check_input=True): From cf0955969c533cf421746fa2ddd3603f8e4cc0bf Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 11 May 2015 23:04:45 -0700 Subject: [PATCH 11/35] Adjusted _gradient_boosting.pyx to match the new splitting mechanism. --- sklearn/ensemble/_gradient_boosting.pyx | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index 00ad2c08a0ee3..f95d120943957 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -14,6 +14,7 @@ np.import_array() from sklearn.tree._tree cimport Node from sklearn.tree._tree cimport Tree +from sklearn.tree._utils cimport goes_left ctypedef np.int32_t int32 @@ -31,6 +32,7 @@ from numpy import float64 as np_float64 DTYPE = np.float32 ctypedef np.float32_t DTYPE_t ctypedef np.npy_intp SIZE_t +ctypedef np.npy_int32 INT32_t # constant to mark tree leafs @@ -44,6 +46,7 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X, Py_ssize_t K, Py_ssize_t n_samples, Py_ssize_t n_features, + INT32_t* n_categories, float64 *out): """Predicts output for regression tree and stores it in ``out[i, k]``. @@ -78,6 +81,9 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X, ``n_samples == X.shape[0]``. n_features : int The number of features; ``n_features == X.shape[1]``. + n_categories : INT32_t pointer + Pointer to array of shape [n_features] containing the number of + categories for each feature, or -1 for non-categorical features. out : np.float64_t pointer The pointer to the data array where the predictions are stored.
``out`` is assumed to be a two-dimensional array of @@ -90,7 +96,8 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X, node = root_node # While node not a leaf while node.left_child != -1 and node.right_child != -1: - if X[i * n_features + node.feature] <= node.threshold: + if goes_left(X[i * n_features + node.feature], node.split_value, + n_categories[node.feature]): node = root_node + node.left_child else: node = root_node + node.right_child @@ -123,6 +130,7 @@ def predict_stages(np.ndarray[object, ndim=2] estimators, X.data, tree.nodes, tree.value, scale, k, K, X.shape[0], X.shape[1], + tree.n_categories, ( out).data) ## out += scale * tree.predict(X).reshape((X.shape[0], 1)) @@ -226,7 +234,8 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, if feature_index != -1: # split feature in target set # push left or right child on stack - if X[i, feature_index] <= current_node.threshold: + if goes_left(X[i, feature_index], current_node.split_value, + tree.n_categories[current_node.feature]): # left node_stack[stack_size] = (root_node + current_node.left_child) From 25fcc8835639fbf3dca65a0e3aa9a3f16732a863 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 24 Jun 2015 10:50:38 -0700 Subject: [PATCH 12/35] Added interface code to forest.py and gradient_boosting.py for categorical features. --- sklearn/ensemble/forest.py | 27 +++++++++++---- sklearn/ensemble/gradient_boosting.py | 48 +++++++++++++++------------ 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index e26323f65bfee..bc88343261612 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -90,8 +90,8 @@ def _generate_unsampled_indices(random_state, n_samples): return unsampled_indices -def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, - verbose=0, class_weight=None): +def _parallel_build_trees(tree, forest, X, y, sample_weight, categorical, + tree_idx, n_trees, verbose=0, class_weight=None): """Private function used to fit a single tree in parallel.""" if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) @@ -114,9 +114,11 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, elif class_weight == 'balanced_subsample': curr_sample_weight *= compute_sample_weight('balanced', y, indices) - tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) + tree.fit(X, y, sample_weight=curr_sample_weight, + categorical=categorical, check_input=False) else: - tree.fit(X, y, sample_weight=sample_weight, check_input=False) + tree.fit(X, y, sample_weight=sample_weight, + categorical=categorical, check_input=False) return tree @@ -211,7 +213,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, categorical='None'): """Build a forest of trees from the training set (X, y). Parameters @@ -232,6 +234,19 @@ def fit(self, X, y, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'All'``, or ``'None'``. Indicates which + features should be considered as categorical rather than + ordinal. 
For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + Returns ------- self : object @@ -315,7 +330,7 @@ def fit(self, X, y, sample_weight=None): trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")( delayed(_parallel_build_trees)( - t, self, X, y, sample_weight, i, len(trees), + t, self, X, y, sample_weight, categorical, i, len(trees), verbose=self.verbose, class_weight=self.class_weight) for i, t in enumerate(trees)) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index b17d726cb122a..087a8430f0b15 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -746,7 +746,8 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, self.estimators_ = np.empty((0, 0), dtype=np.object) def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, - random_state, X_idx_sorted, X_csc=None, X_csr=None): + categorical, random_state, X_idx_sorted, X_csc=None, + X_csr=None): """Fit another stage of ``n_classes_`` trees to the boosting model. """ assert sample_mask.dtype == np.bool @@ -777,22 +778,14 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, # no inplace multiplication! sample_weight = sample_weight * sample_mask.astype(np.float64) - if X_csc is not None: - tree.fit(X_csc, residual, sample_weight=sample_weight, - check_input=False, X_idx_sorted=X_idx_sorted) - else: - tree.fit(X, residual, sample_weight=sample_weight, - check_input=False, X_idx_sorted=X_idx_sorted) + tree.fit(X_csc if X_csc is not None else X, residual, + sample_weight=sample_weight, categorical=categorical, + check_input=False, X_idx_sorted=X_idx_sorted) # update tree leaves - if X_csr is not None: - loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred, - sample_weight, sample_mask, - self.learning_rate, k=k) - else: - loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred, - sample_weight, sample_mask, - self.learning_rate, k=k) + loss.update_terminal_regions( + tree.tree_, X_csr if X_csr is not None else X, y, residual, + y_pred, sample_weight, sample_mask, self.learning_rate, k=k) # add tree to ensemble self.estimators_[i, k] = tree @@ -928,7 +921,7 @@ def _check_initialized(self): raise NotFittedError("Estimator not fitted, call `fit`" " before making predictions`.") - def fit(self, X, y, sample_weight=None, monitor=None): + def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): """Fit the gradient boosting model. Parameters @@ -949,6 +942,15 @@ def fit(self, X, y, sample_weight=None, monitor=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'All'``, or ``'None'``. Indicates which + features should be considered as categorical rather than + ordinal. The maximum number of categories per feature is + 64, though the real-world limit will be much lower because + evaluating splits has :math:`O(2^N)` time complexity, for + :math:`N` categories. 
+ monitor : callable, optional The monitor is called after each iteration with the current iteration, a reference to the estimator and the local variables of @@ -1022,8 +1024,9 @@ def fit(self, X, y, sample_weight=None, monitor=None): dtype=np.int32) # fit the boosting stages - n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state, - begin_at_stage, monitor, X_idx_sorted) + n_stages = self._fit_stages(X, y, y_pred, sample_weight, categorical, + random_state, begin_at_stage, monitor, + X_idx_sorted) # change shape of arrays after fit (early-stopping or additional ests) if n_stages != self.estimators_.shape[0]: self.estimators_ = self.estimators_[:n_stages] @@ -1033,8 +1036,9 @@ def fit(self, X, y, sample_weight=None, monitor=None): return self - def _fit_stages(self, X, y, y_pred, sample_weight, random_state, - begin_at_stage=0, monitor=None, X_idx_sorted=None): + def _fit_stages(self, X, y, y_pred, sample_weight, categorical, + random_state, begin_at_stage=0, monitor=None, + X_idx_sorted=None): """Iteratively fits the stages. For each stage it computes the progress (OOB, train score) @@ -1077,8 +1081,8 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state, # fit next stage of trees y_pred = self._fit_stage(i, X, y, y_pred, sample_weight, - sample_mask, random_state, X_idx_sorted, - X_csc, X_csr) + sample_mask, categorical, random_state, + X_idx_sorted, X_csc, X_csr) # track deviance (= loss) if do_oob: From 30fe8296c69f813a696b4625bb0c1b9eb241cbff Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 13 May 2015 23:37:00 -0700 Subject: [PATCH 13/35] Added bit caches to Splitter and Tree. These are used to avoid regenerating a big list of random numbers for every sample in every node when sending it left or right. 
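The cache itself is a packed bitfield with one bit per category, sized (n_categories + 7) // 8 bytes as allocated below. A rough Python sketch of the idea, assuming the drawn category set is already known; make_bit_cache() lives in _utils.pyx, its exact implementation is not shown in this excerpt, and the LSB-first bit order within each byte is an assumption here:

    def make_bit_cache(categories_going_left, n_categories):
        # One bit per category, packed into (n_categories + 7) // 8 bytes
        cache = bytearray((n_categories + 7) // 8)
        for cat in categories_going_left:
            cache[cat // 8] |= 1 << (cat % 8)
        return cache

    def cached_goes_left(feature_value, cache):
        # A single bit test replaces re-drawing the random category set
        cat = int(feature_value)
        return (cache[cat // 8] >> (cat % 8)) & 1 == 1

    cache = make_bit_cache({1, 4, 9}, 12)  # categories 1, 4 and 9 go left
    assert cached_goes_left(4.0, cache)
    assert not cached_goes_left(5.0, cache)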
--- sklearn/ensemble/_gradient_boosting.pyx | 13 +++++-- sklearn/tree/_splitter.pxd | 2 ++ sklearn/tree/_splitter.pyx | 25 ++++++++++--- sklearn/tree/_tree.pxd | 4 +++ sklearn/tree/_tree.pyx | 39 ++++++++++++++++++-- sklearn/tree/_utils.pxd | 11 ++++-- sklearn/tree/_utils.pyx | 47 ++++++++++++++++++------- 7 files changed, 118 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index f95d120943957..a26762bad21d6 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -97,7 +97,7 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X, # While node not a leaf while node.left_child != -1 and node.right_child != -1: if goes_left(X[i * n_features + node.feature], node.split_value, - n_categories[node.feature]): + n_categories[node.feature], node._bit_cache): node = root_node + node.left_child else: node = root_node + node.right_child @@ -123,6 +123,8 @@ def predict_stages(np.ndarray[object, ndim=2] estimators, for k in range(K): tree = estimators[i, k].tree_ + tree.populate_bit_caches() + # avoid buffer validation by casting to ndarray # and get data pointer # need brackets because of casting operator priority @@ -134,6 +136,8 @@ def predict_stages(np.ndarray[object, ndim=2] estimators, ( out).data) ## out += scale * tree.predict(X).reshape((X.shape[0], 1)) + tree.delete_bit_caches() + @cython.nonecheck(False) def predict_stage(np.ndarray[object, ndim=2] estimators, @@ -212,6 +216,8 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, underlying_stack = np_zeros((stack_capacity,), dtype=np.intp) node_stack = ( underlying_stack).data + tree.populate_bit_caches() + for i in range(X.shape[0]): # init stacks for new example stack_size = 1 @@ -235,7 +241,8 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, # split feature in target set # push left or right child on stack if goes_left(X[i, feature_index], current_node.split_value, - tree.n_categories[current_node.feature]): + tree.n_categories[current_node.feature], + current_node._bit_cache): # left node_stack[stack_size] = (root_node + current_node.left_child) @@ -273,6 +280,8 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, raise ValueError("Total weight should be 1.0 but was %.9f" % total_weight) + tree.delete_bit_caches() + def _random_sample_mask(np.npy_intp n_total_samples, np.npy_intp n_total_in_bag, random_state): diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 809feb2ac32fe..64bfb22bd9284 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -17,6 +17,7 @@ from ._criterion cimport Criterion ctypedef np.npy_float32 DTYPE_t # Type of X ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters +ctypedef np.npy_uint8 UINT8_t # Unsigned 8 bit integer ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer @@ -88,6 +89,7 @@ cdef class Splitter: cdef DOUBLE_t* sample_weight cdef INT32_t* n_categories # (n_features) array giving number of # categories (<0 for non-categorical) + cdef UINT8_t* _bit_cache # The samples vector `samples` is maintained by the Splitter object such # that the samples contained in a node are contiguous. 
With this setting, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 46cb3be42898a..49fc1af5ed3c6 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -34,6 +34,7 @@ from ._utils cimport rand_int from ._utils cimport rand_uniform from ._utils cimport RAND_R_MAX from ._utils cimport safe_realloc +from ._utils cimport make_bit_cache from ._utils cimport goes_left cdef double INFINITY = np.inf @@ -98,6 +99,7 @@ cdef class Splitter: self.y_stride = 0 self.sample_weight = NULL self.n_categories = NULL + self._bit_cache = NULL self.max_features = max_features self.min_samples_leaf = min_samples_leaf @@ -113,6 +115,7 @@ cdef class Splitter: free(self.constant_features) free(self.feature_values) free(self.n_categories) + free(self._bit_cache) def __getstate__(self): return {} @@ -193,6 +196,12 @@ cdef class Splitter: self.n_categories[i] = (-1 if n_categories == NULL else n_categories[i]) + # If needed, allocate cache space to hold split info + cdef INT32_t max_n_categories = max( + [self.n_categories[i] for i in range(n_features)]) + if max_n_categories > 0: + safe_realloc(&self._bit_cache, (max_n_categories + 7) // 8) + cdef void node_reset(self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples) nogil: """Reset splitter on node samples[start:end]. @@ -459,7 +468,7 @@ cdef class BestSplitter(BaseDenseSplitter): q = start partition_end = end while q < partition_end: - if ((p >> Xf[q]) & 1): + if (p >> Xf[q]) & 1: q += 1 else: partition_end -= 1 @@ -511,13 +520,16 @@ cdef class BestSplitter(BaseDenseSplitter): # Reorganize into samples[start:best.pos] + samples[best.pos:end] if best.pos < end: + make_bit_cache(best.split_value, self.n_categories[best.feature], + self._bit_cache) feature_offset = X_feature_stride * best.feature partition_end = end p = start while p < partition_end: if goes_left(X[X_sample_stride * samples[p] + feature_offset], - best.split_value, self.n_categories[best.feature]): + best.split_value, self.n_categories[best.feature], + self._bit_cache): p += 1 else: @@ -807,12 +819,14 @@ cdef class RandomSplitter(BaseDenseSplitter): current.split_value.threshold = min_feature_value # Partition + make_bit_cache(current.split_value, self.n_categories[current.feature], + self._bit_cache) partition_end = end p = start while p < partition_end: current_feature_value = Xf[p] if goes_left(current_feature_value, current.split_value, - self.n_categories[current.feature]): + self.n_categories[current.feature], self._bit_cache): p += 1 else: partition_end -= 1 @@ -848,13 +862,16 @@ cdef class RandomSplitter(BaseDenseSplitter): # Reorganize into samples[start:best.pos] + samples[best.pos:end] feature_stride = X_feature_stride * best.feature if best.pos < end: + make_bit_cache(best.split_value, self.n_categories[best.feature], + self._bit_cache) if current.feature != best.feature: partition_end = end p = start while p < partition_end: if goes_left(X[X_sample_stride * samples[p] + feature_stride], - best.split_value, self.n_categories[best.feature]): + best.split_value, self.n_categories[best.feature], + self._bit_cache): p += 1 else: diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index e877f66cf6d0b..e330848fb85a5 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -15,6 +15,7 @@ cimport numpy as np ctypedef np.npy_float32 DTYPE_t # Type of X ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters +ctypedef np.npy_uint8 UINT8_t # Unsigned 8 bit 
integer ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer @@ -34,6 +35,7 @@ cdef struct Node: DOUBLE_t impurity # Impurity of the node (i.e., the value of the criterion) SIZE_t n_node_samples # Number of samples at the node DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node + UINT8_t* _bit_cache cdef class Tree: @@ -69,6 +71,8 @@ cdef class Tree: cdef np.ndarray _get_value_ndarray(self) cdef np.ndarray _get_node_ndarray(self) cdef np.ndarray _get_ncat_ndarray(self) + cdef void populate_bit_caches(self) + cdef void delete_bit_caches(self) cpdef np.ndarray predict(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f88618a0762b5..8bbb24ffb2cd6 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -35,6 +35,7 @@ from ._utils cimport StackRecord from ._utils cimport PriorityHeap from ._utils cimport PriorityHeapRecord from ._utils cimport safe_realloc +from ._utils cimport make_bit_cache from ._utils cimport goes_left from ._utils cimport sizet_ptr_to_ndarray @@ -643,6 +644,7 @@ cdef class Tree: def __dealloc__(self): """Destructor.""" # Free all inner structures + self.delete_bit_caches() free(self.n_classes) free(self.value) free(self.nodes) @@ -781,10 +783,39 @@ cdef class Tree: node.feature = feature node.split_value = split_value + node._bit_cache = NULL + self.node_count += 1 return node_id + cdef void populate_bit_caches(self): + """Allocates and populates bit caches for nodes that split on + categorical features. Should be run before every tree traversal.""" + cdef Node* node = self.nodes + cdef Node* end_node = self.nodes + self.node_count + cdef INT32_t n_categories = 0 + + while node != end_node: + if node.left_child != _TREE_LEAF: + n_categories = self.n_categories[node.feature] + if n_categories > 0: + safe_realloc(&node._bit_cache, (n_categories + 7) // 8) + make_bit_cache(node.split_value, n_categories, + node._bit_cache) + node += 1 + + cdef void delete_bit_caches(self): + """Deallocates the bit cache of each node in the tree. Should be run + after tree traversal.""" + cdef Node* node = self.nodes + cdef Node* end_node = self.nodes + self.node_count + + while node != end_node: + free(node._bit_cache) + node._bit_cache = NULL + node += 1 + cpdef np.ndarray predict(self, object X): """Predict target for X.""" out = self._get_value_ndarray().take(self.apply(X), axis=0, @@ -826,6 +857,8 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + self.populate_bit_caches() + with nogil: for i in range(n_samples): node = self.nodes @@ -833,13 +866,15 @@ cdef class Tree: while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: if goes_left(X_ptr[X_sample_stride * i + X_fx_stride * node.feature], - node.split_value, self.n_categories[node.feature]): + node.split_value, self.n_categories[node.feature], node._bit_cache): node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] out_ptr[i] = (node - self.nodes) # node offset + self.delete_bit_caches() + return out cdef inline np.ndarray _apply_sparse_csr(self, object X): @@ -905,7 +940,7 @@ cdef class Tree: feature_value = 0. 
if goes_left(feature_value, node.split_value, - self.n_categories[node.feature]): + self.n_categories[node.feature], node._bit_cache): node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 856dfaf87aaf2..aea80233cb7c3 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -15,6 +15,7 @@ from ._splitter cimport SplitValue ctypedef np.npy_float32 DTYPE_t # Type of X ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef np.npy_intp SIZE_t # Type for indices and counters +ctypedef np.npy_uint8 UINT8_t # Unsigned 8 bit integer ctypedef np.npy_int32 INT32_t # Signed 32 bit integer ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer @@ -56,9 +57,13 @@ cdef double rand_uniform(double low, double high, cdef double log(double x) nogil -# Function for traversing a tree -cdef bint goes_left(DTYPE_t feature_value, SplitValue split, - INT32_t n_categories) nogil + +# Functions for traversing a tree +cdef inline void make_bit_cache(SplitValue split, INT32_t n_categories, + UINT8_t* bit_cache) nogil + +cdef inline bint goes_left(DTYPE_t feature_value, SplitValue split, + INT32_t n_categories, UINT8_t* bit_cache) nogil # ============================================================================= # Stack data structure diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index a2c975506a751..10d48db6c922e 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -82,25 +82,48 @@ cdef inline double rand_uniform(double low, double high, cdef inline double log(double x) nogil: return ln(x) / ln(2.0) -cdef bint goes_left(DTYPE_t feature_value, SplitValue split, - INT32_t n_categories) nogil: - """Determine whether a sample goes to the left or right child node.""" +cdef inline void make_bit_cache(SplitValue split, INT32_t n_categories, + UINT8_t* bit_cache) nogil: + """Regenerate and store the random numbers for a split.""" cdef UINT32_t rng_seed + cdef SIZE_t q + cdef UINT32_t val, idx, shift - if n_categories < 1: - # Non-categorical feature - return feature_value <= split.threshold - elif (split.cat_split & 1 == 0): + if (n_categories <= 0): + # Non-categorical feature; bit cache not used + return + + if (split.cat_split & 1 == 0): # Bitfield model - return (split.cat_split >> feature_value) & 1 + for q in range((n_categories + 7) // 8): + bit_cache[q] = (split.cat_split >> (q * 8)) & 0xFF else: # Random model + for q in range((n_categories + 7) // 8): + bit_cache[q] = 0 rng_seed = split.cat_split >> 32 for q in range((split.cat_split & 0xFFFFFFFF) >> 1): - if (feature_value == - rand_int(0, n_categories, &rng_seed)): - return 1 - return 0 + val = rand_int(0, n_categories, &rng_seed) + idx = val // 8 + shift = val % 8 + bit_cache[idx] |= (1 << shift) + +cdef inline bint goes_left(DTYPE_t feature_value, SplitValue split, + INT32_t n_categories, UINT8_t* bit_cache) nogil: + """Determine whether a sample goes to the left or right child node.""" + cdef SIZE_t idx, shift + + if n_categories < 1: + # Non-categorical feature + return feature_value <= split.threshold + else: + # Categorical feature, using bit cache + if ( feature_value) < n_categories: + idx = ( feature_value) // 8 + shift = ( feature_value) % 8 + return (bit_cache[idx] >> shift) & 1 + else: + return 0 # ============================================================================= From 1c1b7764a0271d23cf71a1f87ccf99adf25ee93a Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 13 May 2015 
23:43:30 -0700 Subject: [PATCH 14/35] Added a check on the predict() and predict_proba() methods of trees that there are no sparse categorical variables, since they're currently not supported. --- sklearn/tree/tree.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 80a19d573f8bd..1335399af3505 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -450,6 +450,9 @@ def _validate_X_predict(self, X, check_input): n_categories = self.tree_.n_categories categorical_features = np.nonzero(n_categories > 0)[0] if categorical_features.size > 0: + if issparse(X): + raise NotImplementedError("Categorical features not supported" + " with sparse inputs") X = np.copy(X) for feature in categorical_features: rounded = np.round(X[:, feature]).astype('int64') From b68be8cf376dc39c400acee9a43e66760d0eae0a Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Fri, 5 Jun 2015 19:09:07 -0700 Subject: [PATCH 15/35] Added property getter for a Tree's split_values in addition to thresholds. --- sklearn/tree/_tree.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8bbb24ffb2cd6..93a6d8cc89010 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -589,6 +589,10 @@ cdef class Tree: def __get__(self): return self._get_node_ndarray()['split_value']['threshold'][:self.node_count] + property split_value: + def __get__(self): + return self._get_node_ndarray()['split_value'][:self.node_count] + property impurity: def __get__(self): return self._get_node_ndarray()['impurity'][:self.node_count] From e187c1074dc880ff4e59e1280ee5c96b8168ae18 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 17 Jun 2015 18:55:58 -0700 Subject: [PATCH 16/35] Added a check to prevent use of DecisionTree (instead of ExtraTree) with more than 64 categories. --- sklearn/tree/tree.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 1335399af3505..e6575c7332b2d 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -344,7 +344,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', rounded = np.round(X[:, feature]).astype(np.int64) self.category_map_[feature] = dict(izip(set(rounded), count())) X[:, feature] = np.array([self.category_map_[feature][x] - for x in rounded]).astype(DTYPE) + for x in rounded]).astype(DTYPE) n_categories[feature] = len(self.category_map_[feature]) # Set min_weight_leaf from min_weight_fraction_leaf @@ -401,6 +401,12 @@ def fit(self, X, y, sample_weight=None, categorical='None', random_state, self.presort) + if (not isinstance(self.splitter, _splitter.RandomSplitter) and + np.max(n_categories) > 64): + raise ValueError('A feature with {} categories was detected; to' + ' use more than 64, use ExtraTree rather than' + ' DecisionTree.'.format(np.max(n_categories))) + self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) self.tree_.n_categories = n_categories From 909f09b299c28af0817b961f381514faab7f352b Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Tue, 23 Jun 2015 12:12:35 -0700 Subject: [PATCH 17/35] Fixed the numpy NODE dtype so it matches the cython struct. 
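The fix below is about keeping two descriptions of one memory layout in sync: the Cython Node struct and the numpy structured dtype through which the tree exposes the same bytes. The sketch that follows is a hypothetical ctypes analogue, for illustration only; the real struct also carries impurity and sample-count fields, and the real dtype takes its offsets from the Cython struct itself. It also shows the union trick used by SPLITVALUE_DTYPE: two fields at offset 0 viewing the same 8 bytes.

    import ctypes
    import numpy as np

    class CNode(ctypes.Structure):            # hypothetical stand-in layout
        _fields_ = [("left_child", ctypes.c_ssize_t),
                    ("right_child", ctypes.c_ssize_t),
                    ("feature", ctypes.c_ssize_t),
                    ("split_value", ctypes.c_uint64),  # 8-byte union slot
                    ("_bit_cache", ctypes.c_void_p)]   # pointer, stored as intp

    split_value_dtype = np.dtype({
        "names": ["threshold", "cat_split"],
        "formats": [np.float64, np.uint64],
        "offsets": [0, 0],                    # both at offset 0 = a C union
        "itemsize": 8,
    })

    node_dtype = np.dtype({
        "names": [name for name, _ in CNode._fields_],
        "formats": [np.intp, np.intp, np.intp, split_value_dtype, np.intp],
        "offsets": [getattr(CNode, name).offset for name, _ in CNode._fields_],
        "itemsize": ctypes.sizeof(CNode),
    })
    # If any field is missing or misplaced, reads through the dtype are garbage.
    assert node_dtype.itemsize == ctypes.sizeof(CNode)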
--- sklearn/tree/_tree.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 93a6d8cc89010..33756f0932c5b 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -75,9 +75,10 @@ SPLITVALUE_DTYPE = np.dtype({ }) NODE_DTYPE = np.dtype({ 'names': ['left_child', 'right_child', 'feature', 'split_value', - 'impurity', 'n_node_samples', 'weighted_n_node_samples'], + 'impurity', 'n_node_samples', 'weighted_n_node_samples', + '_bit_cache'], 'formats': [np.intp, np.intp, np.intp, SPLITVALUE_DTYPE, np.float64, - np.intp, np.float64], + np.intp, np.float64, np.intp], 'offsets': [ &( NULL).left_child, &( NULL).right_child, @@ -85,7 +86,8 @@ NODE_DTYPE = np.dtype({ &( NULL).split_value, &( NULL).impurity, &( NULL).n_node_samples, - &( NULL).weighted_n_node_samples + &( NULL).weighted_n_node_samples, + &( NULL)._bit_cache ] }) From ede1e69ccaec78af821538287966f1ce029a1d9e Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Tue, 23 Jun 2015 12:13:38 -0700 Subject: [PATCH 18/35] Put the gradient boosting tree descent code into a try..finally block to make sure we free the bit cache. --- sklearn/ensemble/_gradient_boosting.pyx | 126 ++++++++++++------------ 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index a26762bad21d6..49524cd697751 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -218,69 +218,71 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, tree.populate_bit_caches() - for i in range(X.shape[0]): - # init stacks for new example - stack_size = 1 - node_stack[0] = root_node - weight_stack[0] = 1.0 - total_weight = 0.0 - - while stack_size > 0: - # get top node on stack - stack_size -= 1 - current_node = node_stack[stack_size] - - if current_node.left_child == LEAF: - out[i] += weight_stack[stack_size] * value[current_node - root_node] * \ - learn_rate - total_weight += weight_stack[stack_size] - else: - # non-terminal node - feature_index = array_index(current_node.feature, target_feature) - if feature_index != -1: - # split feature in target set - # push left or right child on stack - if goes_left(X[i, feature_index], current_node.split_value, - tree.n_categories[current_node.feature], - current_node._bit_cache): - # left - node_stack[stack_size] = (root_node + - current_node.left_child) - else: - # right - node_stack[stack_size] = (root_node + - current_node.right_child) - stack_size += 1 + try: + for i in range(X.shape[0]): + # init stacks for new example + stack_size = 1 + node_stack[0] = root_node + weight_stack[0] = 1.0 + total_weight = 0.0 + + while stack_size > 0: + # get top node on stack + stack_size -= 1 + current_node = node_stack[stack_size] + + if current_node.left_child == LEAF: + out[i] += weight_stack[stack_size] * value[current_node - root_node] * \ + learn_rate + total_weight += weight_stack[stack_size] else: - # split feature in complement set - # push both children onto stack - - # push left child - node_stack[stack_size] = root_node + current_node.left_child - current_weight = weight_stack[stack_size] - left_sample_frac = root_node[current_node.left_child].n_node_samples / \ - current_node.n_node_samples - if left_sample_frac <= 0.0 or left_sample_frac >= 1.0: - raise ValueError("left_sample_frac:%f, " - "n_samples current: %d, " - "n_samples left: %d" - % (left_sample_frac, - current_node.n_node_samples, - 
root_node[current_node.left_child].n_node_samples)) - weight_stack[stack_size] = current_weight * left_sample_frac - stack_size +=1 - - # push right child - node_stack[stack_size] = root_node + current_node.right_child - weight_stack[stack_size] = current_weight * \ - (1.0 - left_sample_frac) - stack_size +=1 - - if not (0.999 < total_weight < 1.001): - raise ValueError("Total weight should be 1.0 but was %.9f" % - total_weight) - - tree.delete_bit_caches() + # non-terminal node + feature_index = array_index(current_node.feature, target_feature) + if feature_index != -1: + # split feature in target set + # push left or right child on stack + if goes_left(X[i, feature_index], current_node.split_value, + tree.n_categories[current_node.feature], + current_node._bit_cache): + # left + node_stack[stack_size] = (root_node + + current_node.left_child) + else: + # right + node_stack[stack_size] = (root_node + + current_node.right_child) + stack_size += 1 + else: + # split feature in complement set + # push both children onto stack + + # push left child + node_stack[stack_size] = root_node + current_node.left_child + current_weight = weight_stack[stack_size] + left_sample_frac = root_node[current_node.left_child].n_node_samples / \ + current_node.n_node_samples + if left_sample_frac <= 0.0 or left_sample_frac >= 1.0: + raise ValueError("left_sample_frac:%f, " + "n_samples current: %d, " + "n_samples left: %d" + % (left_sample_frac, + current_node.n_node_samples, + root_node[current_node.left_child].n_node_samples)) + weight_stack[stack_size] = current_weight * left_sample_frac + stack_size +=1 + + # push right child + node_stack[stack_size] = root_node + current_node.right_child + weight_stack[stack_size] = current_weight * \ + (1.0 - left_sample_frac) + stack_size +=1 + + if not (0.999 < total_weight < 1.001): + raise ValueError("Total weight should be 1.0 but was %.9f" % + total_weight) + + finally: + tree.delete_bit_caches() def _random_sample_mask(np.npy_intp n_total_samples, From abd27dfa62b6564c46280322d0f321ff1f5e5571 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Thu, 25 Jun 2015 13:21:26 -0700 Subject: [PATCH 19/35] Fixed a python3 compatibility problem. 
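The incompatibility being fixed is small but fatal on import: itertools.izip exists only on Python 2, while on Python 3 the builtin zip is already lazy. The patch routes through six.moves.zip, which resolves to whichever is available; a dependency-free sketch of the same dispatch:

    from itertools import count
    try:
        from itertools import izip as zip_   # Python 2: lazy zip lives here
    except ImportError:
        zip_ = zip                           # Python 3: builtin zip is lazy

    # The pattern the tree code uses this for: raw values -> contiguous codes.
    category_map = dict(zip_(sorted({42, 3, 7}), count()))
    assert category_map == {3: 0, 7: 1, 42: 2}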
--- sklearn/tree/tree.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index e6575c7332b2d..61bba4d6c8cf0 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -20,7 +20,7 @@ from abc import ABCMeta from abc import abstractmethod from math import ceil -from itertools import izip, count +from itertools import count import numpy as np from scipy.sparse import issparse @@ -29,6 +29,7 @@ from ..base import ClassifierMixin from ..base import RegressorMixin from ..externals import six +from ..externals.six.moves import zip from ..feature_selection.from_model import _LearntSelectorMixin from ..utils import check_array from ..utils import check_random_state @@ -342,7 +343,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', X = np.copy(X) for feature in categorical: rounded = np.round(X[:, feature]).astype(np.int64) - self.category_map_[feature] = dict(izip(set(rounded), count())) + self.category_map_[feature] = dict(zip(set(rounded), count())) X[:, feature] = np.array([self.category_map_[feature][x] for x in rounded]).astype(DTYPE) n_categories[feature] = len(self.category_map_[feature]) @@ -463,7 +464,7 @@ def _validate_X_predict(self, X, check_input): for feature in categorical_features: rounded = np.round(X[:, feature]).astype('int64') new_cat = set(rounded) - set(self.category_map_[feature]) - new_cat_map = dict(izip(new_cat, count(n_categories[feature]))) + new_cat_map = dict(zip(new_cat, count(n_categories[feature]))) X[:, feature] = np.array( [self.category_map_[feature].get(x, new_cat_map.get(x)) for x in rounded]).astype(DTYPE) From ee398817e4e0215403e23ed4c1842dd2a87fc43a Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Tue, 13 Oct 2015 17:36:49 -0700 Subject: [PATCH 20/35] Fixed a Splitter validation bug in tree.py --- sklearn/tree/tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 61bba4d6c8cf0..02133db98f96b 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -394,7 +394,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter - if not isinstance(self.splitter, Splitter): + if not isinstance(splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, self.max_features_, min_samples_leaf, @@ -402,7 +402,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', random_state, self.presort) - if (not isinstance(self.splitter, _splitter.RandomSplitter) and + if (not isinstance(splitter, _splitter.RandomSplitter) and np.max(n_categories) > 64): raise ValueError('A feature with {} categories was detected; to' ' use more than 64, use ExtraTree rather than' From b81b29cc13c865ff26eca34bec42515840cae13d Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Tue, 13 Oct 2015 17:42:37 -0700 Subject: [PATCH 21/35] RandomSplitter now flips a coin to send each category value left or right --- sklearn/tree/_splitter.pxd | 7 +++---- sklearn/tree/_splitter.pyx | 8 +------- sklearn/tree/_utils.pyx | 10 ++++------ 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 64bfb22bd9284..0565a87dde706 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -35,10 +35,9 @@ ctypedef union SplitValue: # for up to 64 categories, sending samples left if the bit # corresponding to their category is 1 or right if it is 0. 
If the # LSB is 1, then the more significant 32 bits of cat_split is a - # random seed and the next 31 bits are the number of deviates to - # draw. To evaluate a sample, draw the required set of categories - # and check if the sample's feature value is in the set. If so, - # send it left; otherwise right. This second method allows up to + # random seed. To evaluate a sample, use the random seed to flip a + # coin (category_value + 1) times and send it left if the last + # flip gives 1; otherwise right. This second method allows up to # 2**31 category values, but can only be used for RandomSplitter. DOUBLE_t threshold UINT64_t cat_split diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 49fc1af5ed3c6..f4b406f0dd391 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -726,7 +726,6 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef DTYPE_t current_feature_value cdef SIZE_t partition_end cdef bint is_categorical - cdef UINT32_t split_n_draw cdef UINT64_t split_seed _init_split(&best, end) @@ -805,13 +804,8 @@ cdef class RandomSplitter(BaseDenseSplitter): # Construct a random split is_categorical = self.n_categories[current.feature] > 0 if is_categorical: - # split_n_draw is the number of categories to send left - # TODO: this should be a binomial draw - split_n_draw = rand_int(1, self.n_categories[current.feature], - random_state) & 0x7FFFFFFF split_seed = our_rand_r(random_state) - current.split_value.cat_split = ( - (split_seed << 32) | (split_n_draw << 1) | 1) + current.split_value.cat_split = (split_seed << 32) | 1 else: current.split_value.threshold = rand_uniform( min_feature_value, max_feature_value, random_state) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 10d48db6c922e..4f4e2488bc12e 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -87,7 +87,7 @@ cdef inline void make_bit_cache(SplitValue split, INT32_t n_categories, """Regenerate and store the random numbers for a split.""" cdef UINT32_t rng_seed cdef SIZE_t q - cdef UINT32_t val, idx, shift + cdef UINT32_t val if (n_categories <= 0): # Non-categorical feature; bit cache not used @@ -102,11 +102,9 @@ cdef inline void make_bit_cache(SplitValue split, INT32_t n_categories, for q in range((n_categories + 7) // 8): bit_cache[q] = 0 rng_seed = split.cat_split >> 32 - for q in range((split.cat_split & 0xFFFFFFFF) >> 1): - val = rand_int(0, n_categories, &rng_seed) - idx = val // 8 - shift = val % 8 - bit_cache[idx] |= (1 << shift) + for q in range(n_categories): + val = rand_int(0, 2, &rng_seed) + bit_cache[q // 8] |= val << (q % 8) cdef inline bint goes_left(DTYPE_t feature_value, SplitValue split, INT32_t n_categories, UINT8_t* bit_cache) nogil: From 548e4e5ede16173cb53fde28b621eb0478c4fb9c Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Tue, 13 Oct 2015 17:49:06 -0700 Subject: [PATCH 22/35] For categorical features, RandomSplitter now retries up to 20 random splits until a non-trivial one is found --- sklearn/tree/_splitter.pyx | 60 +++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index f4b406f0dd391..e259f524bc704 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -712,6 +712,7 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef SIZE_t f_i = n_features cdef SIZE_t f_j cdef SIZE_t p + cdef SIZE_t q cdef SIZE_t feature_stride # Number of features discovered to be constant during the split search cdef 
SIZE_t n_found_constants = 0 @@ -802,36 +803,43 @@ cdef class RandomSplitter(BaseDenseSplitter): features[f_i], features[f_j] = features[f_j], features[f_i] # Construct a random split - is_categorical = self.n_categories[current.feature] > 0 - if is_categorical: - split_seed = our_rand_r(random_state) - current.split_value.cat_split = (split_seed << 32) | 1 - else: - current.split_value.threshold = rand_uniform( - min_feature_value, max_feature_value, random_state) - if current.split_value.threshold == max_feature_value: - current.split_value.threshold = min_feature_value - - # Partition - make_bit_cache(current.split_value, self.n_categories[current.feature], - self._bit_cache) - partition_end = end - p = start - while p < partition_end: - current_feature_value = Xf[p] - if goes_left(current_feature_value, current.split_value, - self.n_categories[current.feature], self._bit_cache): - p += 1 + # Repeat up to 20 times if a trivial split is constructed + # (this can only happen with a categorical feature) + for q in range(20): + is_categorical = self.n_categories[current.feature] > 0 + if is_categorical: + split_seed = our_rand_r(random_state) + current.split_value.cat_split = (split_seed << 32) | 1 else: - partition_end -= 1 + current.split_value.threshold = rand_uniform( + min_feature_value, max_feature_value, random_state) + if current.split_value.threshold == max_feature_value: + current.split_value.threshold = min_feature_value + + # Partition + make_bit_cache(current.split_value, self.n_categories[current.feature], + self._bit_cache) + partition_end = end + p = start + while p < partition_end: + current_feature_value = Xf[p] + if goes_left(current_feature_value, current.split_value, + self.n_categories[current.feature], self._bit_cache): + p += 1 + else: + partition_end -= 1 + + Xf[p] = Xf[partition_end] + Xf[partition_end] = current_feature_value - Xf[p] = Xf[partition_end] - Xf[partition_end] = current_feature_value + samples[p], samples[partition_end] = ( + samples[partition_end], samples[p]) - samples[p], samples[partition_end] = ( - samples[partition_end], samples[p]) + current.pos = partition_end - current.pos = partition_end + # Break early if a non-trivial split was found + if partition_end != start and partition_end != end: + break # Reject if min_samples_leaf is not guaranteed if (((current.pos - start) < min_samples_leaf) or From f28074f903402cb1ab08c9038dc2c287a1766c35 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 14 Oct 2015 19:10:29 -0700 Subject: [PATCH 23/35] Fixed a nasty bug where categorical normalization was happening incorrectly with ensembles. Refactored the categorical transformations into functions. 
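The hazard this patch removes appears to be map drift: when each tree in an ensemble built its own value-to-code dictionary from its own view of the data, the same raw category could receive different integer codes in different trees. The refactoring builds the map once in the ensemble's fit and reuses it at predict time. A simplified single-column sketch of the contract (helper names mirror the new preproc_categorical/validate_categorical, but this is not the patched code; the patch builds its map from an unordered set, while this sketch uses np.unique for determinism):

    import numpy as np

    def fit_category_map(col):
        # Fit time: rounded raw values -> codes 0..n_categories-1, built once.
        values = np.unique(np.round(col).astype(np.int64))
        return {v: i for i, v in enumerate(values)}

    def apply_category_map(col, cmap):
        # Predict time: reuse the SAME map everywhere; values never seen in
        # training get fresh codes past n_categories, so they cannot collide
        # with a trained category.
        rounded = np.round(col).astype(np.int64)
        unseen = sorted(set(rounded) - set(cmap))
        new = {v: len(cmap) + i for i, v in enumerate(unseen)}
        return np.array([cmap.get(v, new.get(v)) for v in rounded],
                        dtype=np.float32)

    cmap = fit_category_map(np.array([10., 30., 10., 20.]))  # {10:0, 20:1, 30:2}
    apply_category_map(np.array([30., 99.]), cmap)           # -> [2., 3.]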
--- sklearn/ensemble/forest.py | 13 +- sklearn/ensemble/gradient_boosting.py | 16 ++- sklearn/tree/__init__.py | 1 + sklearn/tree/tree.py | 180 ++++++++++++++++++-------- 4 files changed, 153 insertions(+), 57 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index bc88343261612..e393d7aface71 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -57,7 +57,8 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..metrics import r2_score from ..preprocessing import OneHotEncoder from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, - ExtraTreeClassifier, ExtraTreeRegressor) + ExtraTreeClassifier, ExtraTreeRegressor, + preproc_categorical, validate_categorical) from ..tree._tree import DTYPE, DOUBLE from ..utils import check_random_state, check_array, compute_sample_weight from ..exceptions import DataConversionWarning, NotFittedError @@ -155,6 +156,7 @@ def __init__(self, self.verbose = verbose self.warm_start = warm_start self.class_weight = class_weight + self.category_map_ = None def apply(self, X): """Apply trees in the forest to X, return leaf indices. @@ -260,6 +262,10 @@ def fit(self, X, y, sample_weight=None, categorical='None'): # ensemble sorts the indices. X.sort_indices() + # Preprocess categorical variables + X, _, self.category_map_ = preproc_categorical( + X, categorical, check_input=True) + # Remap output n_samples, self.n_features_ = X.shape @@ -361,7 +367,10 @@ def _validate_X_predict(self, X): raise NotFittedError("Estimator not fitted, " "call `fit` before exploiting the model.") - return self.estimators_[0]._validate_X_predict(X, check_input=True) + X = self.estimators_[0]._validate_X_predict(X, check_input=True) + X = validate_categorical(X, self.category_map_) + + return X @property def feature_importances_(self): diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 087a8430f0b15..93f965e6b13cc 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -47,6 +47,8 @@ from time import time from ..tree.tree import DecisionTreeRegressor +from ..tree.tree import preproc_categorical +from ..tree.tree import validate_categorical from ..tree._tree import DTYPE from ..tree._tree import TREE_LEAF @@ -742,6 +744,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, self.max_leaf_nodes = max_leaf_nodes self.warm_start = warm_start self.presort = presort + self.category_map_ = None self.estimators_ = np.empty((0, 0), dtype=np.object) @@ -981,6 +984,10 @@ def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): y = self._validate_y(y) + # Preprocess categorical variables + X, _, self.category_map_ = preproc_categorical( + X, categorical, check_input=True) + random_state = check_random_state(self.random_state) self._check_params() @@ -1144,9 +1151,10 @@ def decision_function(self, X): Regression and binary classification produce an array of shape [n_samples]. """ - self._check_initialized() X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True) + X = validate_categorical(X, self.category_map_) + score = self._decision_function(X) if score.shape[1] == 1: return score.ravel() @@ -1172,6 +1180,8 @@ def _staged_decision_function(self, X): ``k == 1``, otherwise ``k==n_classes``. 
""" X = check_array(X, dtype=DTYPE, order="C") + X = validate_categorical(X, self.category_map_) + score = self._init_decision_function(X) for i in range(self.estimators_.shape[0]): predict_stage(self.estimators_, i, X, self.learning_rate, score) @@ -1471,6 +1481,8 @@ def decision_function(self, X): [n_samples]. """ X = check_array(X, dtype=DTYPE, order="C") + X = validate_categorical(X, self.category_map_) + score = self._decision_function(X) if score.shape[1] == 1: return score.ravel() @@ -1807,6 +1819,8 @@ def predict(self, X): The predicted values. """ X = check_array(X, dtype=DTYPE, order="C") + X = validate_categorical(X, self.category_map_) + return self._decision_function(X).ravel() def staged_predict(self, X): diff --git a/sklearn/tree/__init__.py b/sklearn/tree/__init__.py index 1394bd914d27c..42bab93f089ab 100644 --- a/sklearn/tree/__init__.py +++ b/sklearn/tree/__init__.py @@ -7,6 +7,7 @@ from .tree import DecisionTreeRegressor from .tree import ExtraTreeClassifier from .tree import ExtraTreeRegressor +from .tree import preproc_categorical, validate_categorical from .export import export_graphviz __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor", diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 02133db98f96b..96e02a1a3d063 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -66,6 +66,117 @@ SPARSE_SPLITTERS = {"best": _splitter.BestSparseSplitter, "random": _splitter.RandomSparseSplitter} + +# ============================================================================= +# Support functions +# ============================================================================= +def preproc_categorical(X, categorical, check_input): + """Preprocess categorical features by mapping them to + range(n_categories). Used for fitting. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + Feature array + categorical : array-like or str + Specification of which features are categorical. See fit(). + check_input : bool + If False, bypass creation of category map and transformation + of X. Use only if you know what you are doing. + + Returns + ------- + X : array, shape=(n_samples, n_features) + Transformed copy of the feature array (or the original if + there are no categorical features) + n_categories : array, shape=(n_features,) + Number of categories for each feature (-1 if non-categorical) + category_map : list, length n_features + For each feature, a dictionary relating values to transformed + values, or an empty dictionary for non-categorical features + + """ + n_features = np.shape(X)[1] + if isinstance(categorical, str): + if categorical == "None": + categorical = np.array([]) + elif categorical == "All": + categorical = np.arange(n_features) + else: + raise ValueError("Invalid value for categorical: %s. 
Allowed" + " strings are 'All' or 'None'" % categorical) + categorical = np.asarray(categorical) + if categorical.dtype == np.bool: + if categorical.shape != (n_features,): + raise ValueError("Shape of boolean parameter categorical must" + " be (n_features,)") + categorical = np.nonzero(categorical)[0] + if (len(categorical.shape) != 1 or + categorical.size > n_features or + (categorical.size > 0 and + (np.min(categorical) < 0 or + np.max(categorical) >= n_features))): + raise ValueError("Invalid shape or invalid feature index for" + " parameter categorical") + if issparse(X) and categorical.size > 0: + raise NotImplementedError("Categorical features not supported with" + " sparse inputs") + + n_categories = np.full(n_features, -1, dtype=np.int32) + category_map = [{}] * n_features + if categorical.size > 0 and check_input: + X = np.copy(X) + for feature in categorical: + rounded = np.round(X[:, feature]).astype(np.int64) + unique_rounded = np.unique(rounded) + if check_input: + category_map[feature] = dict(zip(unique_rounded, count())) + X[:, feature] = np.array([category_map[feature][x] + for x in rounded], dtype=DTYPE) + n_categories[feature] = len(unique_rounded) + + return X, n_categories, category_map + + +def validate_categorical(X, category_map): + """Map categorical features onto sequential integers. Used for + predicting. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + Feature array + category_map : list, length n_features + For each feature, a dictionary relating values to transformed + values, or an empty dictionary for non-categorical features + + Returns + ------- + X : array, shape=(n_samples, n_features) + Transformed copy of the feature array (or the original if + there are no categorical features) + """ + if category_map is None: + return X + + n_categories = np.array([len(x) for x in category_map]) + categorical_features = np.nonzero(n_categories > 0)[0] + if categorical_features.size > 0: + if issparse(X): + raise NotImplementedError("Categorical features not supported" + " with sparse inputs") + X = np.copy(X) + for feature in categorical_features: + rounded = np.round(X[:, feature]).astype('int64') + new_cat = set(rounded) - set(category_map[feature]) + new_cat_map = dict(zip(new_cat, count(n_categories[feature]))) + X[:, feature] = np.array( + [category_map[feature].get(x, new_cat_map.get(x)) + for x in rounded]).astype(DTYPE) + + return X + + # ============================================================================= # Base decision tree # ============================================================================= @@ -312,41 +423,9 @@ def fit(self, X, y, sample_weight=None, categorical='None', else: sample_weight = expanded_class_weight - if isinstance(categorical, str): - if categorical == "None": - categorical = np.array([]) - elif categorical == "All": - categorical = np.arange(self.n_features_) - else: - raise ValueError("Invalid value for categorical: %s. 
Allowed" - " strings are 'All' or 'None'" % categorical) - categorical = np.atleast_1d(categorical).flatten() - if categorical.dtype == np.bool: - if categorical.size != self.n_features_: - raise ValueError("Shape of boolean parameter categorical must" - " be [n_features]") - categorical = np.nonzero(categorical)[0] - if (categorical.size > self.n_features_ or - (categorical.size > 0 and - (np.min(categorical) < 0 or - np.max(categorical) >= self.n_features_))): - raise ValueError("Invalid shape or invalid feature index for" - " parameter categorical") - if issparse(X) and len(categorical) > 0: - raise NotImplementedError("Categorical features not supported with" - " sparse inputs") - - # Determine the number of categories in each categorical feature - n_categories = np.zeros(self.n_features_, dtype=np.int32) - 1 - self.category_map_ = [None] * self.n_features_ - if categorical.size > 0: - X = np.copy(X) - for feature in categorical: - rounded = np.round(X[:, feature]).astype(np.int64) - self.category_map_[feature] = dict(zip(set(rounded), count())) - X[:, feature] = np.array([self.category_map_[feature][x] - for x in rounded]).astype(DTYPE) - n_categories[feature] = len(self.category_map_[feature]) + # Do preprocessing of categorical variables + X, n_categories, self.category_map_ = preproc_categorical( + X, categorical, check_input) # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: @@ -453,22 +532,6 @@ def _validate_X_predict(self, X, check_input): "input n_features is %s " % (self.n_features_, n_features)) - # Map categorical features onto integers - n_categories = self.tree_.n_categories - categorical_features = np.nonzero(n_categories > 0)[0] - if categorical_features.size > 0: - if issparse(X): - raise NotImplementedError("Categorical features not supported" - " with sparse inputs") - X = np.copy(X) - for feature in categorical_features: - rounded = np.round(X[:, feature]).astype('int64') - new_cat = set(rounded) - set(self.category_map_[feature]) - new_cat_map = dict(zip(new_cat, count(n_categories[feature]))) - X[:, feature] = np.array( - [self.category_map_[feature].get(x, new_cat_map.get(x)) - for x in rounded]).astype(DTYPE) - return X def predict(self, X, check_input=True): @@ -487,7 +550,7 @@ def predict(self, X, check_input=True): check_input : boolean, (default=True) Allow to bypass several input checking. - Don't use this parameter unless you know what you do. + Don't use this parameter unless you know what you are doing. Returns ------- @@ -496,6 +559,9 @@ def predict(self, X, check_input=True): """ X = self._validate_X_predict(X, check_input) + if check_input: + X = validate_categorical(X, self.category_map_) + proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -537,7 +603,7 @@ def apply(self, X, check_input=True): check_input : boolean, (default=True) Allow to bypass several input checking. - Don't use this parameter unless you know what you do. + Don't use this parameter unless you know what you are doing. Returns ------- @@ -548,6 +614,9 @@ def apply(self, X, check_input=True): numbering. """ X = self._validate_X_predict(X, check_input) + if check_input: + X = validate_categorical(X, self.category_map_) + return self.tree_.apply(X) def decision_path(self, X, check_input=True): @@ -782,7 +851,7 @@ class in a leaf. check_input : boolean, (default=True) Allow to bypass several input checking. - Don't use this parameter unless you know what you do. 
+ Don't use this parameter unless you know what you are doing. Parameters ---------- @@ -799,6 +868,9 @@ class in a leaf. classes corresponds to that in the attribute `classes_`. """ X = self._validate_X_predict(X, check_input) + if check_input: + X = validate_categorical(X, self.category_map_) + proba = self.tree_.predict(X) if self.n_outputs_ == 1: From 36c2e47faf96ec50a067c842f655f3816f46b5db Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 9 Dec 2015 19:26:49 -0800 Subject: [PATCH 24/35] Added some unit tests. More to come. --- sklearn/tree/tests/test_tree.py | 41 +++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index e4ca2be5e452a..1b55529bbca7f 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1443,3 +1443,44 @@ def test_no_sparse_y_support(): # Currently we don't support sparse y for name in ALL_TREES: yield (check_no_sparse_y_support, name) + + +def test_invalid_categorical_str(): + check = lambda nm: assert_raises(ValueError, ALL_TREES[nm]().fit, X, y, + categorical='example invalid string') + for name in ALL_TREES: + yield check, name + + +def test_invalid_categorical_bool(): + check = lambda nm: assert_raises(ValueError, ALL_TREES[nm]().fit, X, y, + categorical=[False, False, False]) + for name in ALL_TREES: + yield check, name + + +def check_invalid_categorical_idx(name): + Tree = ALL_TREES[name] + bad_catvals = [[1, 2], [-3], [[0]], [0, 0, 1]] + for catval in bad_catvals: + assert_raises(ValueError, Tree().fit, X, y, categorical=catval) + + +def test_invalid_categorical_idx(): + for name in ALL_TREES: + yield check_invalid_categorical_idx, name + + +def check_no_sparse_with_categorical(name): + X, y, X_sparse = [DATASETS['toy'][z] for z in + ['X', 'y', 'X_sparse']] + Tree = ALL_TREES[name] + assert_raises(NotImplementedError, Tree().fit, X_sparse, y, + categorical='All') + assert_raises(NotImplementedError, + Tree().fit(X, y, categorical='All').predict, X_sparse) + + +def test_no_sparse_with_categorical(): + for name in SPARSE_TREES: + yield check_no_sparse_with_categorical, name From d555231c0043984b0c4c95fed87bb94653112133 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 21 Mar 2016 18:49:02 -0700 Subject: [PATCH 25/35] Upped the maximum number of times RandomSplitter will retry if it accidentally creates a trivial split. --- sklearn/tree/_splitter.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index e259f524bc704..028f346c814d8 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -803,9 +803,9 @@ cdef class RandomSplitter(BaseDenseSplitter): features[f_i], features[f_j] = features[f_j], features[f_i] # Construct a random split - # Repeat up to 20 times if a trivial split is constructed + # Repeat up to 60 times if a trivial split is constructed # (this can only happen with a categorical feature) - for q in range(20): + for q in range(60): is_categorical = self.n_categories[current.feature] > 0 if is_categorical: split_seed = our_rand_r(random_state) From e2fb46ed6022e8e82348e75ba7809d39cf94407d Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sat, 26 Mar 2016 00:08:46 -0700 Subject: [PATCH 26/35] Added code to BestSplitter to restrict split trials to categories represented in the local sample. 
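The patch below cuts the search space from 2**n_categories to 2**n_present by enumerating subsets of only the categories that actually occur in the node, then shifting each bit back to its absolute position via the cat_offs table. A pure-Python sketch of that bookkeeping (same arithmetic as the hunk, minus the Cython types):

    def present_categories(values, n_categories):
        seen = 0
        for v in values:
            seen |= 1 << int(v)
        cat_offs = []                   # cat_offs[local] = absolute - local
        n_present = 0
        for q in range(n_categories):
            if (seen >> q) & 1:
                cat_offs.append(q - n_present)
                n_present += 1
        return cat_offs

    def expand_subset(p, cat_offs):
        # p indexes a subset of the *present* categories; shift each set bit
        # left by its offset to land on the absolute category index.
        cat_split = 0
        for q, off in enumerate(cat_offs):
            cat_split |= (p & (1 << q)) << off
        return cat_split

    offs = present_categories([0, 2, 5], n_categories=8)  # only 3 of 8 present
    assert expand_subset(0b110, offs) == 0b100100         # local {1,2} -> {2,5}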
--- sklearn/tree/_splitter.pyx | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 028f346c814d8..f002a5bdb5e06 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -365,6 +365,9 @@ cdef class BestSplitter(BaseDenseSplitter): cdef DTYPE_t current_feature_value cdef SIZE_t partition_end cdef bint is_categorical + cdef UINT64_t cat_split + cdef INT32_t ncat_present + cdef INT32_t cat_offs[64] _init_split(&best, end) @@ -452,23 +455,40 @@ cdef class BestSplitter(BaseDenseSplitter): # Evaluate all splits self.criterion.reset() is_categorical = self.n_categories[current.feature] > 0 - p = 0 if is_categorical else start + if is_categorical: + p = 0 + # Identify the subset of categories present (for performance reasons) + cat_split = 0 + ncat_present = 0 + for q in range(start, end): + cat_split |= 1 << (Xf[q]) + for q in range(self.n_categories[current.feature]): + if cat_split & (1 << q): + cat_offs[ncat_present] = q - ncat_present + ncat_present += 1 + else: + p = start while True: if is_categorical: # WARNING: This is O(n_samples * # 2**n_categories), and will be very slow # for more than just a few categories. - if p > (1 << self.n_categories[current.feature]) - 1: + if p > (1 << ncat_present) - 1: break else: p += 2 # LSB must always be 0 + # Expand the bits of p out into cat_split + cat_split = 0 + for q in range(ncat_present): + cat_split |= (p & (1 << q)) << cat_offs[q] + # Partition q = start partition_end = end while q < partition_end: - if (p >> Xf[q]) & 1: + if (cat_split >> Xf[q]) & 1: q += 1 else: partition_end -= 1 @@ -510,7 +530,7 @@ cdef class BestSplitter(BaseDenseSplitter): if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement if is_categorical: - current.split_value.cat_split = p + current.split_value.cat_split = cat_split else: current.split_value.threshold = (Xf[p - 1] + Xf[p]) / 2.0 if current.split_value.threshold == Xf[p]: From 05cd4fefcf0275056433371d27d3c6a2efd06fea Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Thu, 5 May 2016 21:57:18 -0700 Subject: [PATCH 27/35] Fixed a warning, which was actually a compile error in clang. --- sklearn/tree/_utils.pxd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index aea80233cb7c3..5d2bf007c3926 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -59,11 +59,11 @@ cdef double log(double x) nogil # Functions for traversing a tree -cdef inline void make_bit_cache(SplitValue split, INT32_t n_categories, - UINT8_t* bit_cache) nogil +cdef void make_bit_cache(SplitValue split, INT32_t n_categories, + UINT8_t* bit_cache) nogil -cdef inline bint goes_left(DTYPE_t feature_value, SplitValue split, - INT32_t n_categories, UINT8_t* bit_cache) nogil +cdef bint goes_left(DTYPE_t feature_value, SplitValue split, + INT32_t n_categories, UINT8_t* bit_cache) nogil # ============================================================================= # Stack data structure From 227d65ee3809eff505fe087a3349ec4d4a6424fd Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 29 May 2016 22:22:03 -0700 Subject: [PATCH 28/35] Fixed a bug where BestSplitter was miscalculating impurities for categorical features. 
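The one-line fix below (self.criterion.reset()) guards an invariant that is easy to miss: the criterion accumulates its left/right statistics incrementally by position in samples[], so physically reordering that array, as the categorical partition loop does, silently invalidates the running tallies. A toy stand-in (not the real Criterion API) showing the failure mode:

    class RunningSums:
        """Tracks sum(y[:pos]) incrementally, like a miniature Criterion."""
        def __init__(self, y):
            self.y = list(y)
            self.reset()
        def reset(self):
            self.pos = 0
            self.left_sum = 0.0
        def update(self, new_pos):
            # Only walks y[pos:new_pos]; correct only while y's ordering is
            # unchanged since the last reset.
            for i in range(self.pos, new_pos):
                self.left_sum += self.y[i]
            self.pos = new_pos

    crit = RunningSums([1, 0, 1, 1])
    crit.update(2)                               # left_sum == 1
    crit.y[1], crit.y[2] = crit.y[2], crit.y[1]  # partition reorders samples
    crit.reset()                                 # ...so tallies must restart
    crit.update(2)
    assert crit.left_sum == 2                    # now matches the new order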
--- sklearn/tree/_splitter.pyx | 3 +++ sklearn/tree/log | 0 2 files changed, 3 insertions(+) create mode 100644 sklearn/tree/log diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index f002a5bdb5e06..bfc1e0c7a787d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -496,6 +496,9 @@ cdef class BestSplitter(BaseDenseSplitter): samples[q], samples[partition_end] = ( samples[partition_end], samples[q]) current.pos = q + + # Must reset criterion since we've reordered the samples + self.criterion.reset() else: # Non-categorical feature while (p + 1 < end and diff --git a/sklearn/tree/log b/sklearn/tree/log new file mode 100644 index 0000000000000..e69de29bb2d1d From 548c94506a9fc0984e328d92489845266d5a10a4 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Thu, 26 May 2016 23:21:51 -0700 Subject: [PATCH 29/35] Added an implementation of the Breiman shortcut. Turned off for now. --- sklearn/tree/_splitter.pyx | 76 +++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index bfc1e0c7a787d..ed3136c204cc3 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -317,8 +317,51 @@ cdef class BestSplitter(BaseDenseSplitter): self.random_state, self.presort), self.__getstate__()) + + cdef void _shortcut_catlist(self, SIZE_t start, SIZE_t end, INT32_t ncat, + INT32_t ncat_present, const INT32_t *cat_offs, + SIZE_t *shortcut_cat) nogil: + """The Breiman shortcut for finding the best split involves a + preprocessing step wherein we sort the categories by + increasing (weighted) mean of the outcome y (whether 0/1 + binary for classification or quantitative for + regression). This function implements this preprocessing step + and produces a sorted list of category values. 
+ + """ + cdef SIZE_t *samples = self.samples + cdef DTYPE_t *Xf = self.feature_values + cdef DOUBLE_t *y = self.y + cdef SIZE_t y_stride = self.y_stride + cdef DOUBLE_t *sample_weight = self.sample_weight + cdef DOUBLE_t w + cdef SIZE_t cat, localcat + cdef SIZE_t q, partition_end + cdef DTYPE_t sort_value[64] + cdef DTYPE_t sort_den[64] + + for cat in range(ncat): + sort_value[cat] = 0 + sort_den[cat] = 0 + + for q in range(start, end): + cat = Xf[q] + w = sample_weight[samples[q]] if sample_weight else 1.0 + sort_value[cat] += w * (y[y_stride * samples[q]]) + sort_den[cat] += w + + for localcat in range(ncat_present): + cat = localcat + cat_offs[localcat] + sort_value[localcat] = sort_value[cat] / sort_den[cat] + shortcut_cat[localcat] = cat + + # Second step: sort by decreasing impurity + sort(&sort_value[0], shortcut_cat, ncat_present) + + cdef void node_split(self, double impurity, SplitRecord* split, SIZE_t* n_constant_features) nogil: + """Find the best split on node samples[start:end].""" # Find the best split cdef SIZE_t* samples = self.samples @@ -368,6 +411,8 @@ cdef class BestSplitter(BaseDenseSplitter): cdef UINT64_t cat_split cdef INT32_t ncat_present cdef INT32_t cat_offs[64] + cdef SIZE_t shortcut_cat[64] + cdef bint shortcut = 0 _init_split(&best, end) @@ -466,23 +511,34 @@ cdef class BestSplitter(BaseDenseSplitter): if cat_split & (1 << q): cat_offs[ncat_present] = q - ncat_present ncat_present += 1 + if shortcut: + self._shortcut_catlist(start, end, self.n_categories[current.feature], + ncat_present, cat_offs, &shortcut_cat[0]) else: p = start while True: if is_categorical: - # WARNING: This is O(n_samples * - # 2**n_categories), and will be very slow - # for more than just a few categories. - if p > (1 << ncat_present) - 1: - break + if shortcut: + p += 1 + if p >= ncat_present: + break + cat_split = 0 + for q in range(p): + cat_split |= ( 1) << shortcut_cat[q] + if cat_split & 1: + cat_split = (~cat_split) & ( + (( 1) << self.n_categories[current.feature]) - 1) else: - p += 2 # LSB must always be 0 + if p > (1 << ncat_present) - 1: + break + else: + p += 2 # LSB must always be 0 - # Expand the bits of p out into cat_split - cat_split = 0 - for q in range(ncat_present): - cat_split |= (p & (1 << q)) << cat_offs[q] + # Expand the bits of p out into cat_split + cat_split = 0 + for q in range(ncat_present): + cat_split |= (p & (( 1) << q)) << cat_offs[q] # Partition q = start From 1f901ea38dc0efa0aa33a2ec697fac2488d47883 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 30 May 2016 23:34:18 -0700 Subject: [PATCH 30/35] Added code to automatically trigger the Breiman shortcut when appropriate for categorical features. 
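For reference, the shortcut being auto-enabled below is Breiman's classic result: sort the categories by the (weighted) mean of y, and the best binary partition is a prefix of that ordering, so only k-1 candidate splits need scoring instead of 2**(k-1)-1 subsets. The result is exact for two-class classification with Gini or entropy and for MSE regression, which is what the tree.py hunk gates on. A small illustrative sketch, not the Cython implementation:

    import numpy as np

    def breiman_candidate_splits(x_cat, y):
        cats = np.unique(x_cat)
        means = np.array([y[x_cat == c].mean() for c in cats])
        order = cats[np.argsort(means)]
        # Each candidate sends a prefix of the mean-sorted categories left.
        return [set(order[:i].tolist()) for i in range(1, len(order))]

    x = np.array([0, 0, 1, 1, 2, 2])
    y = np.array([0., 1., 1., 1., 0., 0.])
    breiman_candidate_splits(x, y)   # [{2}, {0, 2}]: 2 candidates, not 3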
--- sklearn/tree/_splitter.pxd | 2 ++ sklearn/tree/_splitter.pyx | 10 ++++++---- sklearn/tree/tree.py | 9 ++++++++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 0565a87dde706..df009c988f925 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -82,6 +82,8 @@ cdef class Splitter: cdef bint presort # Whether to use presorting, only # allowed on dense data + cdef bint shortcut # Whether decision trees are allowed to use the + # Breiman shortcut for categorical features cdef DOUBLE_t* y cdef SIZE_t y_stride diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ed3136c204cc3..d155a6769fbdf 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -63,7 +63,7 @@ cdef class Splitter: def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state, bint presort): + object random_state, bint presort, bint shortcut): """ Parameters ---------- @@ -106,6 +106,7 @@ cdef class Splitter: self.min_weight_leaf = min_weight_leaf self.random_state = random_state self.presort = presort + self.shortcut = shortcut def __dealloc__(self): """Destructor.""" @@ -263,7 +264,7 @@ cdef class BaseDenseSplitter(Splitter): def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state, bint presort): + object random_state, bint presort, bint shortcut): self.X = NULL self.X_sample_stride = 0 @@ -412,7 +413,7 @@ cdef class BestSplitter(BaseDenseSplitter): cdef INT32_t ncat_present cdef INT32_t cat_offs[64] cdef SIZE_t shortcut_cat[64] - cdef bint shortcut = 0 + cdef bint shortcut = self.shortcut _init_split(&best, end) @@ -511,6 +512,7 @@ cdef class BestSplitter(BaseDenseSplitter): if cat_split & (1 << q): cat_offs[ncat_present] = q - ncat_present ncat_present += 1 + shortcut = self.shortcut if ncat_present > 3 else 0 # No benefit for small N if shortcut: self._shortcut_catlist(start, end, self.n_categories[current.feature], ncat_present, cat_offs, &shortcut_cat[0]) @@ -996,7 +998,7 @@ cdef class BaseSparseSplitter(Splitter): def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state, bint presort): + object random_state, bint presort, bint shortcut): # Parent __cinit__ is automatically called self.X_data = NULL diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 96e02a1a3d063..20b5b9857be06 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -469,6 +469,12 @@ def fit(self, X, y, sample_weight=None, categorical='None', self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_) + if is_classification: + use_shortcut = (self.n_classes_.tolist() == [2] and + (isinstance(criterion, _criterion.Gini) or + isinstance(criterion, _criterion.Entropy))) + else: + use_shortcut = isinstance(criterion, _criterion.MSE) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS @@ -479,7 +485,8 @@ def fit(self, X, y, sample_weight=None, categorical='None', min_samples_leaf, min_weight_leaf, random_state, - self.presort) + self.presort, + use_shortcut) if (not isinstance(splitter, _splitter.RandomSplitter) and np.max(n_categories) > 64): From 18cecc5040b592d12be4613d57d1260c7e586ac1 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sun, 5 Jun 2016 13:49:23 -0700 Subject: [PATCH 31/35] Fixed a left-shift-too-far (undefined behavior) bug. 
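The undefined behavior in question: in C, shifting a 64-bit integer by 64 or more positions is undefined, so ((UINT64)1 << n) - 1 breaks exactly at the boundary case n == 64 that a full 64-category bitfield needs. The patch rewrites these as down-shifts of an all-ones word, keeping every shift count within 0..63. Python integers are unbounded, so the sketch below emulates uint64 with a mask just to show the two forms agree where both are defined:

    MASK64 = (1 << 64) - 1

    def ones_by_upshift(n):        # C analogue of ((UINT64)1 << n) - 1
        return ((1 << n) - 1) & MASK64    # in C this is UB once n == 64

    def ones_by_downshift(n):      # C analogue of (~(UINT64)0) >> (64 - n)
        return (~0 & MASK64) >> (64 - n)  # shift count is 0..63 for n >= 1

    assert all(ones_by_upshift(n) == ones_by_downshift(n) for n in range(1, 64))
    assert ones_by_downshift(64) == MASK64   # well-defined even at n == 64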
--- sklearn/tree/_splitter.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index d155a6769fbdf..aebe988e15e6d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -527,12 +527,12 @@ cdef class BestSplitter(BaseDenseSplitter): break cat_split = 0 for q in range(p): - cat_split |= ( 1) << shortcut_cat[q] + cat_split |= ( 1) << shortcut_cat[q] if cat_split & 1: cat_split = (~cat_split) & ( - (( 1) << self.n_categories[current.feature]) - 1) + (~( 0)) >> (64 - self.n_categories[current.feature])) else: - if p > (1 << ncat_present) - 1: + if p > ((~( 0)) >> (64 - ncat_present)): break else: p += 2 # LSB must always be 0 @@ -540,7 +540,7 @@ cdef class BestSplitter(BaseDenseSplitter): # Expand the bits of p out into cat_split cat_split = 0 for q in range(ncat_present): - cat_split |= (p & (( 1) << q)) << cat_offs[q] + cat_split |= (p & (( 1) << q)) << cat_offs[q] # Partition q = start From 138c8d4b71e6dee78581459b1433c4f6e5d3b939 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Mon, 4 Jul 2016 22:48:10 -0700 Subject: [PATCH 32/35] Removed numpy access to the split value union to pacify unit tests run with numpy v1.6. This should be reverted when support for numpy v1.6 is dropped. --- sklearn/tree/_tree.pyx | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 33756f0932c5b..401de169975a8 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -68,16 +68,11 @@ cdef SIZE_t INITIAL_STACK_SIZE = 10 cdef DTYPE_t MIN_IMPURITY_SPLIT = 1e-7 # Repeat struct definition for numpy -SPLITVALUE_DTYPE = np.dtype({ - 'names': ['threshold', 'cat_split'], - 'formats': [np.float64, np.uint64], - 'offsets': [0, 0] -}) NODE_DTYPE = np.dtype({ - 'names': ['left_child', 'right_child', 'feature', 'split_value', + 'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', '_bit_cache'], - 'formats': [np.intp, np.intp, np.intp, SPLITVALUE_DTYPE, np.float64, + 'formats': [np.intp, np.intp, np.intp, np.float64, np.float64, np.intp, np.float64, np.intp], 'offsets': [ &( NULL).left_child, @@ -589,11 +584,7 @@ cdef class Tree: property threshold: def __get__(self): - return self._get_node_ndarray()['split_value']['threshold'][:self.node_count] - - property split_value: - def __get__(self): - return self._get_node_ndarray()['split_value'][:self.node_count] + return self._get_node_ndarray()['threshold'][:self.node_count] property impurity: def __get__(self): From 1330942d70b0161fe4ae4e86b1beab8171aaaf19 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Wed, 13 Jul 2016 22:19:43 -0700 Subject: [PATCH 33/35] Replaced a call to np.full with np.ones, to accomodate older versions of numpy. 
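The portability nit here: np.full was only added in numpy 1.8, while the negated np.ones spelling works on the older versions this branch still supported. The two are equivalent:

    import numpy as np

    n_features = 5
    a = -np.ones(n_features, dtype=np.int32)      # works on old numpy
    b = np.full(n_features, -1, dtype=np.int32)   # requires numpy >= 1.8
    assert (a == b).all()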
--- sklearn/tree/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 20b5b9857be06..c10303d395ff6 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -122,7 +122,7 @@ def preproc_categorical(X, categorical, check_input): raise NotImplementedError("Categorical features not supported with" " sparse inputs") - n_categories = np.full(n_features, -1, dtype=np.int32) + n_categories = -np.ones(n_features, dtype=np.int32) category_map = [{}] * n_features if categorical.size > 0 and check_input: X = np.copy(X) From 2a555b17c772b687ab5344092a9f5bb90ac0d308 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Thu, 14 Jul 2016 08:57:43 -0700 Subject: [PATCH 34/35] Moved the categorical parameter from fit() to the constructor, for trees, forests, and gradient boosting. Tweaked unit tests to match. --- sklearn/ensemble/forest.py | 116 +++++++++++++++++++------- sklearn/ensemble/gradient_boosting.py | 64 +++++++------- sklearn/tree/tests/test_tree.py | 23 +++-- sklearn/tree/tree.py | 92 +++++++++++++------- 4 files changed, 202 insertions(+), 93 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index e393d7aface71..8afabe098bb27 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -91,8 +91,8 @@ def _generate_unsampled_indices(random_state, n_samples): return unsampled_indices -def _parallel_build_trees(tree, forest, X, y, sample_weight, categorical, - tree_idx, n_trees, verbose=0, class_weight=None): +def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, + n_trees, verbose=0, class_weight=None): """Private function used to fit a single tree in parallel.""" if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) @@ -115,11 +115,9 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, categorical, elif class_weight == 'balanced_subsample': curr_sample_weight *= compute_sample_weight('balanced', y, indices) - tree.fit(X, y, sample_weight=curr_sample_weight, - categorical=categorical, check_input=False) + tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) else: - tree.fit(X, y, sample_weight=sample_weight, - categorical=categorical, check_input=False) + tree.fit(X, y, sample_weight=sample_weight, check_input=False) return tree @@ -215,7 +213,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr - def fit(self, X, y, sample_weight=None, categorical='None'): + def fit(self, X, y, sample_weight=None): """Build a forest of trees from the training set (X, y). Parameters @@ -236,19 +234,6 @@ def fit(self, X, y, sample_weight=None, categorical='None'): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - categorical : array-like or str - Array of feature indices, boolean array of length - n_features, ``'All'``, or ``'None'``. Indicates which - features should be considered as categorical rather than - ordinal. For decision trees, the maximum number of - categories per feature is 64, though the real-world limit - will be much lower because evaluating splits has - :math:`O(2^N)` time complexity, for :math:`N` - categories. Extra-randomized trees do not have this - limitation because they do not try to find the best - split. For these trees, the maximum number of categories - per feature is :math:`2^{31}`. 
- Returns ------- self : object @@ -264,7 +249,7 @@ def fit(self, X, y, sample_weight=None, categorical='None'): # Preprocess categorical variables X, _, self.category_map_ = preproc_categorical( - X, categorical, check_input=True) + X, self.categorical, check_input=True) # Remap output n_samples, self.n_features_ = X.shape @@ -336,7 +321,7 @@ def fit(self, X, y, sample_weight=None, categorical='None'): trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")( delayed(_parallel_build_trees)( - t, self, X, y, sample_weight, categorical, i, len(trees), + t, self, X, y, sample_weight, i, len(trees), verbose=self.verbose, class_weight=self.class_weight) for i, t in enumerate(trees)) @@ -829,6 +814,19 @@ class RandomForestClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + bootstrap : boolean, optional (default=True) Whether bootstrap samples are used when building trees. @@ -922,6 +920,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + categorical="none", bootstrap=True, oob_score=False, n_jobs=1, @@ -935,7 +934,7 @@ def __init__(self, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", - "random_state"), + "random_state", "categorical"), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -951,6 +950,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.categorical = categorical class RandomForestRegressor(ForestRegressor): @@ -1022,6 +1022,19 @@ class RandomForestRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + bootstrap : boolean, optional (default=True) Whether bootstrap samples are used when building trees. 
@@ -1085,6 +1098,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + categorical="none", bootstrap=True, oob_score=False, n_jobs=1, @@ -1097,7 +1111,7 @@ def __init__(self, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", - "random_state"), + "random_state", "categorical"), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -1112,6 +1126,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.categorical = categorical class ExtraTreesClassifier(ForestClassifier): @@ -1181,6 +1196,19 @@ class ExtraTreesClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. @@ -1276,6 +1304,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + categorical="none", bootstrap=False, oob_score=False, n_jobs=1, @@ -1289,7 +1318,7 @@ def __init__(self, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", - "random_state"), + "random_state", "categorical"), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -1305,6 +1334,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.categorical = categorical class ExtraTreesRegressor(ForestRegressor): @@ -1374,6 +1404,19 @@ class ExtraTreesRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. 
@@ -1438,6 +1481,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + categorical="none", bootstrap=False, oob_score=False, n_jobs=1, @@ -1450,7 +1494,7 @@ def __init__(self, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", - "random_state"), + "random_state", "categorical"), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -1465,6 +1509,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.categorical = categorical class RandomTreesEmbedding(BaseForest): @@ -1541,6 +1586,19 @@ class RandomTreesEmbedding(BaseForest): and add more estimators to the ensemble, otherwise, just fit a whole new forest. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + Attributes ---------- estimators_ : list of DecisionTreeClassifier @@ -1567,14 +1625,15 @@ def __init__(self, n_jobs=1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + categorical="none"): super(RandomTreesEmbedding, self).__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", - "random_state"), + "random_state", "categorical"), bootstrap=False, oob_score=False, n_jobs=n_jobs, @@ -1590,6 +1649,7 @@ def __init__(self, self.max_features = 1 self.max_leaf_nodes = max_leaf_nodes self.sparse_output = sparse_output + self.categorical = categorical def _set_oob_score(self, X, y): raise NotImplementedError("OOB score not supported by tree embedding") diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 93f965e6b13cc..66af4b3741f39 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -726,7 +726,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, init, subsample, max_features, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, - warm_start=False, presort='auto'): + warm_start=False, presort='auto', categorical='none'): self.n_estimators = n_estimators self.learning_rate = learning_rate @@ -744,13 +744,13 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, self.max_leaf_nodes = max_leaf_nodes self.warm_start = warm_start self.presort = presort + self.categorical = categorical self.category_map_ = None self.estimators_ = np.empty((0, 0), dtype=np.object) def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, - categorical, random_state, X_idx_sorted, X_csc=None, - X_csr=None): + random_state, X_idx_sorted, X_csc=None, X_csr=None): """Fit another stage of ``n_classes_`` trees to the boosting model. 
""" assert sample_mask.dtype == np.bool @@ -775,15 +775,16 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, random_state=random_state, - presort=self.presort) + presort=self.presort, + categorical=self.categorical) if self.subsample < 1.0: # no inplace multiplication! sample_weight = sample_weight * sample_mask.astype(np.float64) tree.fit(X_csc if X_csc is not None else X, residual, - sample_weight=sample_weight, categorical=categorical, - check_input=False, X_idx_sorted=X_idx_sorted) + sample_weight=sample_weight, check_input=False, + X_idx_sorted=X_idx_sorted) # update tree leaves loss.update_terminal_regions( @@ -924,7 +925,7 @@ def _check_initialized(self): raise NotFittedError("Estimator not fitted, call `fit`" " before making predictions`.") - def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): + def fit(self, X, y, sample_weight=None, monitor=None): """Fit the gradient boosting model. Parameters @@ -945,15 +946,6 @@ def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - categorical : array-like or str - Array of feature indices, boolean array of length - n_features, ``'All'``, or ``'None'``. Indicates which - features should be considered as categorical rather than - ordinal. The maximum number of categories per feature is - 64, though the real-world limit will be much lower because - evaluating splits has :math:`O(2^N)` time complexity, for - :math:`N` categories. - monitor : callable, optional The monitor is called after each iteration with the current iteration, a reference to the estimator and the local variables of @@ -986,7 +978,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): # Preprocess categorical variables X, _, self.category_map_ = preproc_categorical( - X, categorical, check_input=True) + X, self.categorical, check_input=True) random_state = check_random_state(self.random_state) self._check_params() @@ -1031,9 +1023,8 @@ def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): dtype=np.int32) # fit the boosting stages - n_stages = self._fit_stages(X, y, y_pred, sample_weight, categorical, - random_state, begin_at_stage, monitor, - X_idx_sorted) + n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state, + begin_at_stage, monitor, X_idx_sorted) # change shape of arrays after fit (early-stopping or additional ests) if n_stages != self.estimators_.shape[0]: self.estimators_ = self.estimators_[:n_stages] @@ -1043,9 +1034,8 @@ def fit(self, X, y, sample_weight=None, categorical='None', monitor=None): return self - def _fit_stages(self, X, y, y_pred, sample_weight, categorical, - random_state, begin_at_stage=0, monitor=None, - X_idx_sorted=None): + def _fit_stages(self, X, y, y_pred, sample_weight, random_state, + begin_at_stage=0, monitor=None, X_idx_sorted=None): """Iteratively fits the stages. For each stage it computes the progress (OOB, train score) @@ -1088,7 +1078,7 @@ def _fit_stages(self, X, y, y_pred, sample_weight, categorical, # fit next stage of trees y_pred = self._fit_stage(i, X, y, y_pred, sample_weight, - sample_mask, categorical, random_state, + sample_mask, random_state, X_idx_sorted, X_csc, X_csr) # track deviance (= loss) @@ -1393,6 +1383,15 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): .. 
versionadded:: 0.17 *presort* parameter. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. The maximum number of categories per feature is + 64, though the real-world limit will be much lower because + evaluating splits has :math:`O(2^N)` time complexity, for + :math:`N` categories. + Attributes ---------- feature_importances_ : array, shape = [n_features] @@ -1445,7 +1444,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, max_depth=3, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, - presort='auto'): + presort='auto', categorical='none'): super(GradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, @@ -1456,7 +1455,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, max_features=max_features, random_state=random_state, verbose=verbose, max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, - presort=presort) + presort=presort, categorical=categorical) def _validate_y(self, y): check_classification_targets(y) @@ -1744,6 +1743,15 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): .. versionadded:: 0.17 optional parameter *presort*. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. The maximum number of categories per feature is + 64, though the real-world limit will be much lower because + evaluating splits has :math:`O(2^N)` time complexity, for + :math:`N` categories. + Attributes ---------- feature_importances_ : array, shape = [n_features] @@ -1792,7 +1800,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, - warm_start=False, presort='auto'): + warm_start=False, presort='auto', categorical='none'): super(GradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, @@ -1803,7 +1811,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, max_features=max_features, random_state=random_state, alpha=alpha, verbose=verbose, max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, - presort=presort) + presort=presort, categorical=categorical) def predict(self, X): """Predict regression target for X. 
diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1b55529bbca7f..c03420def7723 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1446,24 +1446,31 @@ def test_no_sparse_y_support(): def test_invalid_categorical_str(): - check = lambda nm: assert_raises(ValueError, ALL_TREES[nm]().fit, X, y, + check = lambda nm: assert_raises(ValueError, ALL_TREES[nm], categorical='example invalid string') for name in ALL_TREES: yield check, name def test_invalid_categorical_bool(): - check = lambda nm: assert_raises(ValueError, ALL_TREES[nm]().fit, X, y, - categorical=[False, False, False]) + check = lambda nm: assert_raises( + ValueError, ALL_TREES[nm](categorical=[False, False, False]).fit, X, y) + for name in ALL_TREES: + yield check, name + + +def test_invalid_categorical_shape(): + check = lambda nm: assert_raises( + ValueError, ALL_TREES[nm], categorical=[[0]]) for name in ALL_TREES: yield check, name def check_invalid_categorical_idx(name): Tree = ALL_TREES[name] - bad_catvals = [[1, 2], [-3], [[0]], [0, 0, 1]] + bad_catvals = ([1, 2], [-3], [0, 0, 1]) for catval in bad_catvals: - assert_raises(ValueError, Tree().fit, X, y, categorical=catval) + assert_raises(ValueError, Tree(categorical=catval).fit, X, y) def test_invalid_categorical_idx(): @@ -1475,10 +1482,10 @@ def check_no_sparse_with_categorical(name): X, y, X_sparse = [DATASETS['toy'][z] for z in ['X', 'y', 'X_sparse']] Tree = ALL_TREES[name] - assert_raises(NotImplementedError, Tree().fit, X_sparse, y, - categorical='All') + assert_raises(NotImplementedError, Tree(categorical='all').fit, + X_sparse, y) assert_raises(NotImplementedError, - Tree().fit(X, y, categorical='All').predict, X_sparse) + Tree(categorical='all').fit(X, y).predict, X_sparse) def test_no_sparse_with_categorical(): diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index c10303d395ff6..87a6919067633 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -98,13 +98,15 @@ def preproc_categorical(X, categorical, check_input): """ n_features = np.shape(X)[1] if isinstance(categorical, str): - if categorical == "None": + if categorical == 'none': categorical = np.array([]) - elif categorical == "All": + elif categorical == 'all': categorical = np.arange(n_features) else: - raise ValueError("Invalid value for categorical: %s. Allowed" - " strings are 'All' or 'None'" % categorical) + # Should have been caught in the constructor, but just in case + raise ValueError("Invalid value for categorical: {}. Allowed" + " strings are 'all' or 'none'" + "".format(categorical)) categorical = np.asarray(categorical) if categorical.dtype == np.bool: if categorical.shape != (n_features,): @@ -202,7 +204,8 @@ def __init__(self, max_leaf_nodes, random_state, class_weight=None, - presort=False): + presort=False, + categorical='none'): self.criterion = criterion self.splitter = splitter self.max_depth = max_depth @@ -214,6 +217,7 @@ def __init__(self, self.max_leaf_nodes = max_leaf_nodes self.class_weight = class_weight self.presort = presort + self.categorical = categorical self.n_features_ = None self.n_outputs_ = None @@ -224,8 +228,17 @@ def __init__(self, self.tree_ = None self.max_features_ = None - def fit(self, X, y, sample_weight=None, categorical='None', - check_input=True, X_idx_sorted=None): + # Input validation for parameter categorical + if isinstance(self.categorical, str): + if categorical not in ('all', 'none'): + raise ValueError("Invalid value for categorical: {}. 
Allowed" + " strings are 'all' or 'none'" + "".format(categorical)) + elif len(np.shape(categorical)) != 1: + raise ValueError("Invalid shape for parameter categorical") + + def fit(self, X, y, sample_weight=None, check_input=True, + X_idx_sorted=None): """Build a decision tree from the training set (X, y). Parameters @@ -247,19 +260,6 @@ def fit(self, X, y, sample_weight=None, categorical='None', classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - categorical : array-like or str - Array of feature indices, boolean array of length - n_features, ``'All'``, or ``'None'``. Indicates which - features should be considered as categorical rather than - ordinal. For decision trees, the maximum number of - categories per feature is 64, though the real-world limit - will be much lower because evaluating splits has - :math:`O(2^N)` time complexity, for :math:`N` - categories. Extra-randomized trees do not have this - limitation because they do not try to find the best - split. For these trees, the maximum number of categories - per feature is :math:`2^{31}`. - check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you are doing. @@ -425,7 +425,7 @@ def fit(self, X, y, sample_weight=None, categorical='None', # Do preprocessing of categorical variables X, n_categories, self.category_map_ = preproc_categorical( - X, categorical, check_input) + X, self.categorical, check_input) # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: @@ -765,6 +765,19 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): When using either a smaller dataset or a restricted depth, this may speed up the training. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + Attributes ---------- classes_ : array of shape = [n_classes] or a list of such arrays @@ -836,7 +849,8 @@ def __init__(self, random_state=None, max_leaf_nodes=None, class_weight=None, - presort=False): + presort=False, + categorical='none'): super(DecisionTreeClassifier, self).__init__( criterion=criterion, splitter=splitter, @@ -848,7 +862,8 @@ def __init__(self, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, random_state=random_state, - presort=presort) + presort=presort, + categorical=categorical) def predict_proba(self, X, check_input=True): """Predict class probabilities of the input samples X. @@ -1007,6 +1022,19 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): When using either a smaller dataset or a restricted depth, this may speed up the training. + categorical : array-like or str + Array of feature indices, boolean array of length + n_features, ``'all'``, or ``'none'``. Indicates which + features should be considered as categorical rather than + ordinal. 
For decision trees, the maximum number of + categories per feature is 64, though the real-world limit + will be much lower because evaluating splits has + :math:`O(2^N)` time complexity, for :math:`N` + categories. Extra-randomized trees do not have this + limitation because they do not try to find the best + split. For these trees, the maximum number of categories + per feature is :math:`2^{31}`. + Attributes ---------- feature_importances_ : array of shape = [n_features] @@ -1069,7 +1097,8 @@ def __init__(self, max_features=None, random_state=None, max_leaf_nodes=None, - presort=False): + presort=False, + categorical='none'): super(DecisionTreeRegressor, self).__init__( criterion=criterion, splitter=splitter, @@ -1080,7 +1109,8 @@ def __init__(self, max_features=max_features, max_leaf_nodes=max_leaf_nodes, random_state=random_state, - presort=presort) + presort=presort, + categorical=categorical) class ExtraTreeClassifier(DecisionTreeClassifier): @@ -1117,7 +1147,8 @@ def __init__(self, max_features="auto", random_state=None, max_leaf_nodes=None, - class_weight=None): + class_weight=None, + categorical='none'): super(ExtraTreeClassifier, self).__init__( criterion=criterion, splitter=splitter, @@ -1128,7 +1159,8 @@ def __init__(self, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, - random_state=random_state) + random_state=random_state, + categorical=categorical) class ExtraTreeRegressor(DecisionTreeRegressor): @@ -1164,7 +1196,8 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", random_state=None, - max_leaf_nodes=None): + max_leaf_nodes=None, + categorical='none'): super(ExtraTreeRegressor, self).__init__( criterion=criterion, splitter=splitter, @@ -1174,4 +1207,5 @@ def __init__(self, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, - random_state=random_state) + random_state=random_state, + categorical=categorical) From 806951228e08afd86962bbf26033d3346d244927 Mon Sep 17 00:00:00 2001 From: Jeffrey Blackburne Date: Sat, 23 Jul 2016 13:32:31 -0700 Subject: [PATCH 35/35] Added printf debug statements. 
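The checkpoints print to stderr so they remain visible when the test runner
captures stdout. One caveat: print(..., file=sys.stderr) is function-call
syntax, so on Python 2 the module also needs the print_function future import
unless test_bagging.py already has one (a hedged sketch; the full import block
is not shown in the hunk below):

    # Harmless on Python 3; required on Python 2 for print(..., file=...).
    from __future__ import print_function

    import sys

    print('nocats checkpoint 1', file=sys.stderr)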
--- sklearn/ensemble/tests/test_bagging.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 5d85713a76210..67dde9bdf32e6 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -5,6 +5,8 @@ # Author: Gilles Louppe # License: BSD 3 clause +import sys + import numpy as np from sklearn.base import BaseEstimator @@ -419,40 +421,49 @@ def test_parallel_classification(): X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=rng) - + print('nocats checkpoint 1', file=sys.stderr) ensemble = BaggingClassifier(DecisionTreeClassifier(), n_jobs=3, random_state=0).fit(X_train, y_train) # predict_proba + print('nocats checkpoint 2', file=sys.stderr) ensemble.set_params(n_jobs=1) y1 = ensemble.predict_proba(X_test) + print('nocats checkpoint 3', file=sys.stderr) ensemble.set_params(n_jobs=2) y2 = ensemble.predict_proba(X_test) assert_array_almost_equal(y1, y2) + print('nocats checkpoint 4', file=sys.stderr) ensemble = BaggingClassifier(DecisionTreeClassifier(), n_jobs=1, random_state=0).fit(X_train, y_train) + print('nocats checkpoint 5', file=sys.stderr) y3 = ensemble.predict_proba(X_test) assert_array_almost_equal(y1, y3) # decision_function + print('nocats checkpoint 6', file=sys.stderr) ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'), n_jobs=3, random_state=0).fit(X_train, y_train) + print('nocats checkpoint 7', file=sys.stderr) ensemble.set_params(n_jobs=1) decisions1 = ensemble.decision_function(X_test) + print('nocats checkpoint 8', file=sys.stderr) ensemble.set_params(n_jobs=2) decisions2 = ensemble.decision_function(X_test) assert_array_almost_equal(decisions1, decisions2) + print('nocats checkpoint 9', file=sys.stderr) ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'), n_jobs=1, random_state=0).fit(X_train, y_train) + print('nocats checkpoint 10', file=sys.stderr) decisions3 = ensemble.decision_function(X_test) assert_array_almost_equal(decisions1, decisions3)
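Taken together, these patches leave categorical as a constructor parameter
that is validated up front and propagated to sub-estimators through
estimator_params. A minimal usage sketch against this branch (the toy data and
the choice of column 0 as the categorical feature are illustrative):

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    # Column 0 holds integer-encoded categories; column 1 is ordinal.
    X = np.array([[0., 1.5],
                  [1., 2.0],
                  [2., 0.5],
                  [0., 3.0]])
    y = np.array([0, 1, 1, 0])

    # Before patch 34: clf.fit(X, y, categorical=[0])
    # After patch 34 the flag lives on the estimator, so ensembles can
    # clone it onto every sub-tree:
    clf = DecisionTreeClassifier(categorical=[0], random_state=0)
    clf.fit(X, y)
    print(clf.predict(X))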