[WIP] nocats CI testing #7068

Closed · wants to merge 35 commits
35 commits, all by jblackburne:

34355bb  Apr 13, 2015  Created SplitValue datatype to generalize the concept of a threshold …
a9db0ea  Apr 13, 2015  Added attribute n_categories to Splitter and Tree, an array of ints t…
53457e5  Apr 13, 2015  Added a goes_left function for evaluating splits.
ba93458  Apr 13, 2015  Tree.apply() now uses the goes_left function.
d7d13b3  Apr 17, 2015  Added categorical split code to BestSplitter.node_split
ec37e11  Apr 17, 2015  Added categorical split code to RandomSplitter.node_split
9452196  May 11, 2015  Added a 'categorical' parameter to BaseDecisionTree.fit(). It current…
361f87c  May 11, 2015  Added a property getter/setter for Tree.n_categories
fc7efa6  May 11, 2015  Added n_categories arguments to Splitter.init() and TreeBuilder.build…
144d665  Jun 25, 2015  Added python interface for categorical features.
cf09559  May 12, 2015  Adjusted _gradient_boosting.pyx to match the new splitting mechanism.
25fcc88  Jun 24, 2015  Added interface code to forest.py and gradient_boosting.py for catego…
30fe829  May 14, 2015  Added bit caches to Splitter and Tree. These are used to avoid regene…
1c1b776  May 14, 2015  Added a check on the predict() and predict_proba() methods of trees t…
b68be8c  Jun 6, 2015   Added property getter for a Tree's split_values in addition to thresh…
e187c10  Jun 18, 2015  Added a check to prevent use of DecisionTree (instead of ExtraTree) w…
909f09b  Jun 23, 2015  Fixed the numpy NODE dtype so it matches the cython struct.
ede1e69  Jun 23, 2015  Put the gradient boosting tree descent code into a try..finally block…
abd27df  Jun 25, 2015  Fixed a python3 compatibility problem.
ee39881  Oct 14, 2015  Fixed a Splitter validation bug in tree.py
b81b29c  Oct 14, 2015  RandomSplitter now flips a coin to send each category value left or r…
548e4e5  Oct 14, 2015  For categorical features, RandomSplitter now retries up to 20 random …
f28074f  Oct 15, 2015  Fixed a nasty bug where categorical normalization was happening incor…
36c2e47  Dec 10, 2015  Added some unit tests. More to come.
d555231  Mar 22, 2016  Upped the maximum number of times RandomSplitter will retry if it acc…
e2fb46e  Mar 26, 2016  Added code to BestSplitter to restrict split trials to categories rep…
05cd4fe  May 6, 2016   Fixed a warning, which was actually a compile error in clang.
227d65e  May 30, 2016  Fixed a bug where BestSplitter was miscalculating impurities for cate…
548c945  May 27, 2016  Added an implementation of the Breiman shortcut. Turned off for now.
1f901ea  May 31, 2016  Added code to automatically trigger the Breiman shortcut when appropr…
18cecc5  Jun 5, 2016   Fixed a left-shift-too-far (undefined behavior) bug.
138c8d4  Jul 5, 2016   Removed numpy access to the split value union to pacify unit tests ru…
1330942  Jul 14, 2016  Replaced a call to np.full with np.ones, to accommodate older versions…
2a555b1  Jul 14, 2016  Moved the categorical parameter from fit() to the constructor, for tr…
8069512  Jul 23, 2016  Added printf debug statements.
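
The thrust of these commits: a split is no longer just a scalar threshold. Each node stores a SplitValue that acts as a threshold for numeric features and as a set of categories for categorical ones, and tree traversal asks a goes_left() helper which branch a sample takes (see the diff below). As a rough orientation, here is a minimal pure-Python sketch of that rule; goes_left_sketch and the raw-bitset encoding are illustrative assumptions, not the PR's actual Cython API, which keeps the set in a SplitValue union and consults per-node bit caches during traversal.

# Hedged pure-Python sketch, not the PR's Cython code. Assumes a categorical
# split is encoded as an integer bitset whose set bits mark the categories
# routed to the left child.

def goes_left_sketch(x, split_value, n_categories):
    """Return True if feature value ``x`` is routed to the left child."""
    if n_categories < 0:
        # Numeric feature: split_value acts as an ordinary threshold.
        return x <= split_value
    # Categorical feature: X stores the category code as a float;
    # test the corresponding bit of the category set.
    return (int(split_value) >> int(x)) & 1 == 1

# A split sending categories {0, 2} of a 3-category feature left (0b101):
assert goes_left_sketch(2.0, 0b101, 3)
assert not goes_left_sketch(1.0, 0b101, 3)
# An ordinary numeric split at threshold 0.5:
assert goes_left_sketch(0.25, 0.5, -1)
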
sklearn/ensemble/_gradient_boosting.pyx: 138 changes (79 additions, 59 deletions)
@@ -14,6 +14,7 @@ np.import_array()

from sklearn.tree._tree cimport Node
from sklearn.tree._tree cimport Tree
from sklearn.tree._utils cimport goes_left


ctypedef np.int32_t int32
@@ -31,6 +32,7 @@ from numpy import float64 as np_float64
DTYPE = np.float32
ctypedef np.float32_t DTYPE_t
ctypedef np.npy_intp SIZE_t
ctypedef np.npy_int32 INT32_t


# constant to mark tree leafs
@@ -44,6 +46,7 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X,
Py_ssize_t K,
Py_ssize_t n_samples,
Py_ssize_t n_features,
INT32_t* n_categories,
float64 *out):
"""Predicts output for regression tree and stores it in ``out[i, k]``.

@@ -78,6 +81,9 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X,
``n_samples == X.shape[0]``.
n_features : int
The number of features; ``n_features == X.shape[1]``.
n_categories : INT32_t pointer
Pointer to array of shape [n_features] containing the number of
categories for each feature, or -1 for non-categorical features.
out : np.float64_t pointer
The pointer to the data array where the predictions are stored.
``out`` is assumed to be a two-dimensional array of
@@ -90,7 +96,8 @@ cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X,
node = root_node
# While node not a leaf
while node.left_child != -1 and node.right_child != -1:
-        if X[i * n_features + node.feature] <= node.threshold:
+        if goes_left(X[i * n_features + node.feature], node.split_value,
+                     n_categories[node.feature], node._bit_cache):
node = root_node + node.left_child
else:
node = root_node + node.right_child
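
Read in pure-Python terms, the loop above descends to a leaf with goes_left() deciding each branch. A minimal sketch, reusing goes_left_sketch from above and assuming illustrative node objects rather than the C array of Node structs the Cython code walks:

def descend_to_leaf_sketch(nodes, x, n_categories):
    # nodes[0] is the root; a left_child/right_child of -1 marks a leaf,
    # matching the sentinel convention in the loop above.
    node = nodes[0]
    while node.left_child != -1 and node.right_child != -1:
        if goes_left_sketch(x[node.feature], node.split_value,
                            n_categories[node.feature]):
            node = nodes[node.left_child]
        else:
            node = nodes[node.right_child]
    return node
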
Expand All @@ -116,16 +123,21 @@ def predict_stages(np.ndarray[object, ndim=2] estimators,
for k in range(K):
tree = estimators[i, k].tree_

tree.populate_bit_caches()

# avoid buffer validation by casting to ndarray
# and get data pointer
# need brackets because of casting operator priority
_predict_regression_tree_inplace_fast(
<DTYPE_t*> X.data,
tree.nodes, tree.value,
scale, k, K, X.shape[0], X.shape[1],
tree.n_categories,
<float64 *> (<np.ndarray> out).data)
## out += scale * tree.predict(X).reshape((X.shape[0], 1))

tree.delete_bit_caches()
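
Note the populate_bit_caches()/delete_bit_caches() pair bracketing the prediction loop: the per-node caches of category membership are built once per tree and freed afterwards. A hedged sketch of that lifecycle (predict_with_caches is an illustrative helper, not part of the PR; the try/finally mirrors what _partial_dependence_tree does below):

def predict_with_caches(tree, predict_fn):
    # Build the per-node category bit caches, run the prediction
    # callback, then free the caches even if prediction raises.
    tree.populate_bit_caches()
    try:
        return predict_fn()
    finally:
        tree.delete_bit_caches()
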


@cython.nonecheck(False)
def predict_stage(np.ndarray[object, ndim=2] estimators,
@@ -204,65 +216,73 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X,
underlying_stack = np_zeros((stack_capacity,), dtype=np.intp)
node_stack = <Node **>(<np.ndarray> underlying_stack).data

-    for i in range(X.shape[0]):
-        # init stacks for new example
-        stack_size = 1
-        node_stack[0] = root_node
-        weight_stack[0] = 1.0
-        total_weight = 0.0
-
-        while stack_size > 0:
-            # get top node on stack
-            stack_size -= 1
-            current_node = node_stack[stack_size]
-
-            if current_node.left_child == LEAF:
-                out[i] += weight_stack[stack_size] * value[current_node - root_node] * \
-                          learn_rate
-                total_weight += weight_stack[stack_size]
-            else:
-                # non-terminal node
-                feature_index = array_index(current_node.feature, target_feature)
-                if feature_index != -1:
-                    # split feature in target set
-                    # push left or right child on stack
-                    if X[i, feature_index] <= current_node.threshold:
-                        # left
-                        node_stack[stack_size] = (root_node +
-                                                  current_node.left_child)
-                    else:
-                        # right
-                        node_stack[stack_size] = (root_node +
-                                                  current_node.right_child)
-                    stack_size += 1
+    tree.populate_bit_caches()
+
+    try:
+        for i in range(X.shape[0]):
+            # init stacks for new example
+            stack_size = 1
+            node_stack[0] = root_node
+            weight_stack[0] = 1.0
+            total_weight = 0.0
+
+            while stack_size > 0:
+                # get top node on stack
+                stack_size -= 1
+                current_node = node_stack[stack_size]
+
+                if current_node.left_child == LEAF:
+                    out[i] += weight_stack[stack_size] * value[current_node - root_node] * \
+                              learn_rate
+                    total_weight += weight_stack[stack_size]
-                else:
-                    # split feature in complement set
-                    # push both children onto stack
-
-                    # push left child
-                    node_stack[stack_size] = root_node + current_node.left_child
-                    current_weight = weight_stack[stack_size]
-                    left_sample_frac = root_node[current_node.left_child].n_node_samples / \
-                                       <double>current_node.n_node_samples
-                    if left_sample_frac <= 0.0 or left_sample_frac >= 1.0:
-                        raise ValueError("left_sample_frac:%f, "
-                                         "n_samples current: %d, "
-                                         "n_samples left: %d"
-                                         % (left_sample_frac,
-                                            current_node.n_node_samples,
-                                            root_node[current_node.left_child].n_node_samples))
-                    weight_stack[stack_size] = current_weight * left_sample_frac
-                    stack_size +=1
-
-                    # push right child
-                    node_stack[stack_size] = root_node + current_node.right_child
-                    weight_stack[stack_size] = current_weight * \
-                                               (1.0 - left_sample_frac)
-                    stack_size +=1
-
-        if not (0.999 < total_weight < 1.001):
-            raise ValueError("Total weight should be 1.0 but was %.9f" %
-                             total_weight)
+                else:
+                    # non-terminal node
+                    feature_index = array_index(current_node.feature, target_feature)
+                    if feature_index != -1:
+                        # split feature in target set
+                        # push left or right child on stack
+                        if goes_left(X[i, feature_index], current_node.split_value,
+                                     tree.n_categories[current_node.feature],
+                                     current_node._bit_cache):
+                            # left
+                            node_stack[stack_size] = (root_node +
+                                                      current_node.left_child)
+                        else:
+                            # right
+                            node_stack[stack_size] = (root_node +
+                                                      current_node.right_child)
+                        stack_size += 1
+                    else:
+                        # split feature in complement set
+                        # push both children onto stack
+
+                        # push left child
+                        node_stack[stack_size] = root_node + current_node.left_child
+                        current_weight = weight_stack[stack_size]
+                        left_sample_frac = root_node[current_node.left_child].n_node_samples / \
+                                           <double>current_node.n_node_samples
+                        if left_sample_frac <= 0.0 or left_sample_frac >= 1.0:
+                            raise ValueError("left_sample_frac:%f, "
+                                             "n_samples current: %d, "
+                                             "n_samples left: %d"
+                                             % (left_sample_frac,
+                                                current_node.n_node_samples,
+                                                root_node[current_node.left_child].n_node_samples))
+                        weight_stack[stack_size] = current_weight * left_sample_frac
+                        stack_size +=1
+
+                        # push right child
+                        node_stack[stack_size] = root_node + current_node.right_child
+                        weight_stack[stack_size] = current_weight * \
+                                                   (1.0 - left_sample_frac)
+                        stack_size +=1
+
+            if not (0.999 < total_weight < 1.001):
+                raise ValueError("Total weight should be 1.0 but was %.9f" %
+                                 total_weight)
+
+    finally:
+        tree.delete_bit_caches()
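
The traversal above computes partial dependence as a weighted sum over leaves: a split on a target feature follows the branch the grid point takes, while a split on any other feature descends into both children, each weighted by the fraction of training samples it received. A minimal pure-Python sketch, reusing goes_left_sketch from above, with illustrative node objects and x indexed directly by feature id for simplicity:

def partial_dependence_sketch(nodes, x, target_features, n_categories):
    # Each node carries feature, split_value, left_child, right_child,
    # n_node_samples, and value; left_child == -1 marks a leaf.
    total = 0.0
    stack = [(nodes[0], 1.0)]  # (node, probability weight)
    while stack:
        node, weight = stack.pop()
        if node.left_child == -1:
            total += weight * node.value  # leaf: accumulate weighted value
        elif node.feature in target_features:
            # Target feature: follow the branch the grid point x takes.
            if goes_left_sketch(x[node.feature], node.split_value,
                                n_categories[node.feature]):
                stack.append((nodes[node.left_child], weight))
            else:
                stack.append((nodes[node.right_child], weight))
        else:
            # Complement feature: average over both children, weighting
            # each by its share of the training samples.
            left = nodes[node.left_child]
            frac = left.n_node_samples / node.n_node_samples
            stack.append((left, weight * frac))
            stack.append((nodes[node.right_child], weight * (1.0 - frac)))
    return total
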


def _random_sample_mask(np.npy_intp n_total_samples,