diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index cf6d32d1b7fe1..57dacc0726c0a 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -45,7 +45,7 @@ cdef class Criterion: # weighted count of each label. For regression, # the sum of w*y. sum_total[k] is equal to # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k], - # where k is output index. + # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e7ad82f6dcd49..4cc81d0c178af 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -267,7 +267,7 @@ cdef class ClassificationCriterion(Criterion): self.sum_left = calloc(n_elements, sizeof(double)) self.sum_right = calloc(n_elements, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() @@ -854,7 +854,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_left -= w - self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_right = (self.weighted_n_node_samples - self.weighted_n_left) for k in range(self.n_outputs): sum_right[k] = sum_total[k] - sum_left[k] @@ -965,7 +965,7 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs @@ -1268,7 +1268,7 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman - Uses the formula (35) in Friedmans original Gradient Boosting paper: + Uses the formula (35) in Friedman's original Gradient Boosting paper: diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) @@ -1321,5 +1321,5 @@ cdef class FriedmanMSE(MSE): diff = (self.weighted_n_right * total_sum_left - self.weighted_n_left * total_sum_right) / self.n_outputs - return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index fce3abcb734db..cc9649030ed65 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -106,6 +106,8 @@ cdef class PriorityHeap: cdef PriorityHeapRecord* heap_ cdef bint is_empty(self) nogil + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, SIZE_t heap_length) nogil cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos, SIZE_t depth, bint is_leaf, double improvement, double impurity, double impurity_left, diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index a4ccc71946bd1..465afebc99ffa 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -173,40 +173,6 @@ cdef class Stack: # PriorityHeap data structure # ============================================================================= -cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil: - """Restore heap invariant parent.improvement > child.improvement from - ``pos`` upwards. """ - if pos == 0: - return - - cdef SIZE_t parent_pos = (pos - 1) / 2 - - if heap[parent_pos].improvement < heap[pos].improvement: - heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - heapify_up(heap, parent_pos) - - -cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil: - """Restore heap invariant parent.improvement > children.improvement from - ``pos`` downwards. """ - cdef SIZE_t left_pos = 2 * (pos + 1) - 1 - cdef SIZE_t right_pos = 2 * (pos + 1) - cdef SIZE_t largest = pos - - if (left_pos < heap_length and - heap[left_pos].improvement > heap[largest].improvement): - largest = left_pos - - if (right_pos < heap_length and - heap[right_pos].improvement > heap[largest].improvement): - largest = right_pos - - if largest != pos: - heap[pos], heap[largest] = heap[largest], heap[pos] - heapify_down(heap, largest, heap_length) - - cdef class PriorityHeap: """A priority queue implemented as a binary heap. @@ -240,6 +206,38 @@ cdef class PriorityHeap: cdef bint is_empty(self) nogil: return self.heap_ptr <= 0 + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: + """Restore heap invariant parent.improvement > child.improvement from + ``pos`` upwards. """ + if pos == 0: + return + + cdef SIZE_t parent_pos = (pos - 1) / 2 + + if heap[parent_pos].improvement < heap[pos].improvement: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil: + """Restore heap invariant parent.improvement > children.improvement from + ``pos`` downwards. """ + cdef SIZE_t left_pos = 2 * (pos + 1) - 1 + cdef SIZE_t right_pos = 2 * (pos + 1) + cdef SIZE_t largest = pos + + if (left_pos < heap_length and + heap[left_pos].improvement > heap[largest].improvement): + largest = left_pos + + if (right_pos < heap_length and + heap[right_pos].improvement > heap[largest].improvement): + largest = right_pos + + if largest != pos: + heap[pos], heap[largest] = heap[largest], heap[pos] + self.heapify_down(heap, largest, heap_length) + cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos, SIZE_t depth, bint is_leaf, double improvement, double impurity, double impurity_left, @@ -276,7 +274,7 @@ cdef class PriorityHeap: heap[heap_ptr].improvement = improvement # Heapify up - heapify_up(heap, heap_ptr) + self.heapify_up(heap, heap_ptr) # Increase element count self.heap_ptr = heap_ptr + 1 @@ -298,7 +296,7 @@ cdef class PriorityHeap: # Restore heap invariant if heap_ptr > 1: - heapify_down(heap, 0, heap_ptr - 1) + self.heapify_down(heap, 0, heap_ptr - 1) self.heap_ptr = heap_ptr - 1