2 changes: 1 addition & 1 deletion sklearn/tree/_criterion.pxd
@@ -45,7 +45,7 @@ cdef class Criterion:
# weighted count of each label. For regression,
# the sum of w*y. sum_total[k] is equal to
# sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k],
- #   where k is output index.
+ #   where k is output index.
cdef double* sum_left # Same as above, but for the left side of the split
cdef double* sum_right # same as above, but for the right side of the split

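To illustrate the `sum_total` comment above: for classification each entry is a weighted class count, and for regression it is the weighted sum of the targets per output. A minimal NumPy sketch of the regression case, with made-up data and variable names not taken from the source:

```python
import numpy as np

# Hypothetical toy data: 6 samples, 2 regression outputs.
y = np.array([[1.0, 2.0],
              [0.5, 1.5],
              [2.0, 0.0],
              [1.5, 1.0],
              [0.0, 3.0],
              [2.5, 0.5]])
w = np.array([1.0, 2.0, 1.0, 1.0, 0.5, 1.0])   # sample weights
samples = np.array([3, 0, 5, 1, 4, 2])          # index view used by the splitter
start, end = 1, 5                               # the node covers samples[start:end]

idx = samples[start:end]
# sum_total[k] == sum_{i=start}^{end-1} w[samples[i]] * y[samples[i], k]
sum_total = (w[idx, None] * y[idx]).sum(axis=0)
print(sum_total)   # one weighted sum per output k
```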
10 changes: 5 additions & 5 deletions sklearn/tree/_criterion.pyx
@@ -267,7 +267,7 @@ cdef class ClassificationCriterion(Criterion):
self.sum_left = <double*> calloc(n_elements, sizeof(double))
self.sum_right = <double*> calloc(n_elements, sizeof(double))

- if (self.sum_total == NULL or
+ if (self.sum_total == NULL or
self.sum_left == NULL or
self.sum_right == NULL):
raise MemoryError()
@@ -854,7 +854,7 @@ cdef class RegressionCriterion(Criterion):

self.weighted_n_left -= w

- self.weighted_n_right = (self.weighted_n_node_samples -
+ self.weighted_n_right = (self.weighted_n_node_samples -
self.weighted_n_left)
for k in range(self.n_outputs):
sum_right[k] = sum_total[k] - sum_left[k]
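The hunk above shows the usual bookkeeping trick in this criterion code: only the left child's statistics are updated as the split position moves, and the right child's are recovered by subtracting from the node totals instead of re-scanning the samples. A small pure-Python sketch of that idea, with illustrative names:

```python
import numpy as np

y = np.array([0.2, 1.0, 0.7, 1.3, 0.5])
w = np.array([1.0, 1.0, 2.0, 1.0, 0.5])

weighted_n_node_samples = w.sum()
sum_total = (w * y).sum()

pos = 3                                   # candidate split position
weighted_n_left = w[:pos].sum()
sum_left = (w[:pos] * y[:pos]).sum()

# Right-hand statistics come for free, no second pass over the samples:
weighted_n_right = weighted_n_node_samples - weighted_n_left
sum_right = sum_total - sum_left

assert np.isclose(weighted_n_right, w[pos:].sum())
assert np.isclose(sum_right, (w[pos:] * y[pos:]).sum())
```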
@@ -965,7 +965,7 @@ cdef class MSE(RegressionCriterion):

for k in range(self.n_outputs):
impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0
- impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0
+ impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0

impurity_left[0] /= self.n_outputs
impurity_right[0] /= self.n_outputs
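The subtractions in this hunk apply the weighted variance identity Var(y) = E[y^2] - E[y]^2 per output (the matching sum-of-squares term is accumulated separately in the same method), and the result is then averaged over outputs. A quick numerical check of that identity, as a sketch rather than the file's actual code:

```python
import numpy as np

y = np.array([0.2, 1.0, 0.7, 1.3, 0.5])
w = np.array([1.0, 1.0, 2.0, 1.0, 0.5])

weighted_n = w.sum()
sq_sum = (w * y * y).sum()          # weighted sum of squares
lin_sum = (w * y).sum()             # weighted sum (what sum_left/sum_right hold)

impurity = sq_sum / weighted_n - (lin_sum / weighted_n) ** 2.0

mu = np.average(y, weights=w)
assert np.isclose(impurity, np.average((y - mu) ** 2, weights=w))
```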
@@ -1268,7 +1268,7 @@ cdef class MAE(RegressionCriterion):
cdef class FriedmanMSE(MSE):
"""Mean squared error impurity criterion with improvement score by Friedman

- Uses the formula (35) in Friedmans original Gradient Boosting paper:
+ Uses the formula (35) in Friedman's original Gradient Boosting paper:

diff = mean_left - mean_right
improvement = n_left * n_right * diff^2 / (n_left + n_right)
@@ -1321,5 +1321,5 @@ cdef class FriedmanMSE(MSE):
diff = (self.weighted_n_right * total_sum_left -
self.weighted_n_left * total_sum_right) / self.n_outputs

- return (diff * diff / (self.weighted_n_left * self.weighted_n_right *
+ return (diff * diff / (self.weighted_n_left * self.weighted_n_right *
self.weighted_n_node_samples))
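For reference, the return expression above is algebraically the improvement quoted in the FriedmanMSE docstring: mean_left - mean_right equals (weighted_n_right * sum_left - weighted_n_left * sum_right) / (weighted_n_left * weighted_n_right), so dividing diff^2 by weighted_n_left * weighted_n_right * weighted_n_node_samples gives n_left * n_right * (mean_left - mean_right)^2 / (n_left + n_right) in weighted form, up to the 1/n_outputs averaging. A small single-output check with made-up numbers:

```python
import numpy as np

w_left, w_right = 3.0, 5.0            # weighted sample counts per child
sum_left, sum_right = 6.0, 4.0        # weighted sums of y per child
w_node = w_left + w_right

mean_diff = sum_left / w_left - sum_right / w_right
improvement = w_left * w_right * mean_diff ** 2 / (w_left + w_right)

# Same quantity as the return expression above
# (n_outputs == 1 here, so the "/ self.n_outputs" factor is a no-op):
diff = w_right * sum_left - w_left * sum_right
proxy = diff * diff / (w_left * w_right * w_node)

assert np.isclose(improvement, proxy)
```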
2 changes: 2 additions & 0 deletions sklearn/tree/_utils.pxd
@@ -106,6 +106,8 @@ cdef class PriorityHeap:
cdef PriorityHeapRecord* heap_

cdef bint is_empty(self) nogil
+ cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil
+ cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, SIZE_t heap_length) nogil
cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
SIZE_t depth, bint is_leaf, double improvement,
double impurity, double impurity_left,
70 changes: 34 additions & 36 deletions sklearn/tree/_utils.pyx
@@ -173,40 +173,6 @@ cdef class Stack:
# PriorityHeap data structure
# =============================================================================

cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil:
"""Restore heap invariant parent.improvement > child.improvement from
``pos`` upwards. """
if pos == 0:
return

cdef SIZE_t parent_pos = (pos - 1) / 2

if heap[parent_pos].improvement < heap[pos].improvement:
heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos]
heapify_up(heap, parent_pos)


cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos,
SIZE_t heap_length) nogil:
"""Restore heap invariant parent.improvement > children.improvement from
``pos`` downwards. """
cdef SIZE_t left_pos = 2 * (pos + 1) - 1
cdef SIZE_t right_pos = 2 * (pos + 1)
cdef SIZE_t largest = pos

if (left_pos < heap_length and
heap[left_pos].improvement > heap[largest].improvement):
largest = left_pos

if (right_pos < heap_length and
heap[right_pos].improvement > heap[largest].improvement):
largest = right_pos

if largest != pos:
heap[pos], heap[largest] = heap[largest], heap[pos]
heapify_down(heap, largest, heap_length)


cdef class PriorityHeap:
"""A priority queue implemented as a binary heap.

@@ -240,6 +206,38 @@ cdef class PriorityHeap:
cdef bint is_empty(self) nogil:
return self.heap_ptr <= 0

Member: @jmschrei @glouppe This looks cleaner to me. Are you fine with this change?

Contributor Author: Yeah, I originally moved it in because I made another priorityheap class for use with non-records, but we ended up deciding that was too slow. I do think it is cleaner, though.

cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil:

"""Restore heap invariant parent.improvement > child.improvement from
``pos`` upwards. """
if pos == 0:
return

cdef SIZE_t parent_pos = (pos - 1) / 2

if heap[parent_pos].improvement < heap[pos].improvement:
heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos]
self.heapify_up(heap, parent_pos)

cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos,
SIZE_t heap_length) nogil:
"""Restore heap invariant parent.improvement > children.improvement from
``pos`` downwards. """
cdef SIZE_t left_pos = 2 * (pos + 1) - 1
cdef SIZE_t right_pos = 2 * (pos + 1)
cdef SIZE_t largest = pos

if (left_pos < heap_length and
heap[left_pos].improvement > heap[largest].improvement):
largest = left_pos

if (right_pos < heap_length and
heap[right_pos].improvement > heap[largest].improvement):
largest = right_pos

if largest != pos:
heap[pos], heap[largest] = heap[largest], heap[pos]
self.heapify_down(heap, largest, heap_length)

cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
SIZE_t depth, bint is_leaf, double improvement,
double impurity, double impurity_left,
@@ -276,7 +274,7 @@
heap[heap_ptr].improvement = improvement

# Heapify up
- heapify_up(heap, heap_ptr)
+ self.heapify_up(heap, heap_ptr)

# Increase element count
self.heap_ptr = heap_ptr + 1
@@ -298,7 +296,7 @@

# Restore heap invariant
if heap_ptr > 1:
- heapify_down(heap, 0, heap_ptr - 1)
+ self.heapify_down(heap, 0, heap_ptr - 1)

self.heap_ptr = heap_ptr - 1

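To make the structural change easier to follow: the PR moves the module-level heapify helpers into PriorityHeap as methods, but the algorithm itself is the standard binary max-heap keyed on each record's improvement. A plain-Python stand-in for the same push/pop logic (a simplified sketch, not the Cython class):

```python
class MaxHeap:
    """Binary max-heap ordered by each record's 'improvement' value."""

    def __init__(self):
        self.heap = []          # a list plays the role of the C record array

    def is_empty(self):
        return not self.heap

    def heapify_up(self, pos):
        # Restore parent.improvement >= child.improvement from pos upwards.
        if pos == 0:
            return
        parent = (pos - 1) // 2
        if self.heap[parent]["improvement"] < self.heap[pos]["improvement"]:
            self.heap[parent], self.heap[pos] = self.heap[pos], self.heap[parent]
            self.heapify_up(parent)

    def heapify_down(self, pos):
        # Restore the invariant from pos downwards.
        left, right = 2 * pos + 1, 2 * pos + 2
        largest = pos
        if left < len(self.heap) and self.heap[left]["improvement"] > self.heap[largest]["improvement"]:
            largest = left
        if right < len(self.heap) and self.heap[right]["improvement"] > self.heap[largest]["improvement"]:
            largest = right
        if largest != pos:
            self.heap[pos], self.heap[largest] = self.heap[largest], self.heap[pos]
            self.heapify_down(largest)

    def push(self, record):
        self.heap.append(record)              # write at the end of the array
        self.heapify_up(len(self.heap) - 1)   # then sift it up

    def pop(self):
        # Assumes the heap is non-empty.
        top = self.heap[0]
        # Move the last element to the root, shrink, and sift it down.
        self.heap[0] = self.heap[-1]
        self.heap.pop()
        if self.heap:
            self.heapify_down(0)
        return top


# Usage: the record with the best improvement comes out first.
h = MaxHeap()
for i, imp in enumerate([0.1, 0.7, 0.3]):
    h.push({"node_id": i, "improvement": imp})
print(h.pop()["improvement"])   # 0.7
```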