From aae611e34e979cfd09f3c8c03f30452818ddf96c Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 17:05:07 -0500 Subject: [PATCH 1/7] RFC: move heap methods to class and remove trailing spaces --- sklearn/tree/_criterion.pyx | 8 ++--- sklearn/tree/_utils.pxd | 2 ++ sklearn/tree/_utils.pyx | 70 ++++++++++++++++++------------------- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e7ad82f6dcd49..83229816b53f1 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -854,7 +854,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_left -= w - self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_right = (self.weighted_n_node_samples - self.weighted_n_left) for k in range(self.n_outputs): sum_right[k] = sum_total[k] - sum_left[k] @@ -965,7 +965,7 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs @@ -1268,7 +1268,7 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman - Uses the formula (35) in Friedmans original Gradient Boosting paper: + Uses the formula (35) in Friedman's original Gradient Boosting paper: diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) @@ -1321,5 +1321,5 @@ cdef class FriedmanMSE(MSE): diff = (self.weighted_n_right * total_sum_left - self.weighted_n_left * total_sum_right) / self.n_outputs - return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index fce3abcb734db..cc9649030ed65 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -106,6 +106,8 @@ cdef class PriorityHeap: cdef PriorityHeapRecord* heap_ cdef bint is_empty(self) nogil + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, SIZE_t heap_length) nogil cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos, SIZE_t depth, bint is_leaf, double improvement, double impurity, double impurity_left, diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index a4ccc71946bd1..465afebc99ffa 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -173,40 +173,6 @@ cdef class Stack: # PriorityHeap data structure # ============================================================================= -cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil: - """Restore heap invariant parent.improvement > child.improvement from - ``pos`` upwards. """ - if pos == 0: - return - - cdef SIZE_t parent_pos = (pos - 1) / 2 - - if heap[parent_pos].improvement < heap[pos].improvement: - heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - heapify_up(heap, parent_pos) - - -cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil: - """Restore heap invariant parent.improvement > children.improvement from - ``pos`` downwards. """ - cdef SIZE_t left_pos = 2 * (pos + 1) - 1 - cdef SIZE_t right_pos = 2 * (pos + 1) - cdef SIZE_t largest = pos - - if (left_pos < heap_length and - heap[left_pos].improvement > heap[largest].improvement): - largest = left_pos - - if (right_pos < heap_length and - heap[right_pos].improvement > heap[largest].improvement): - largest = right_pos - - if largest != pos: - heap[pos], heap[largest] = heap[largest], heap[pos] - heapify_down(heap, largest, heap_length) - - cdef class PriorityHeap: """A priority queue implemented as a binary heap. @@ -240,6 +206,38 @@ cdef class PriorityHeap: cdef bint is_empty(self) nogil: return self.heap_ptr <= 0 + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: + """Restore heap invariant parent.improvement > child.improvement from + ``pos`` upwards. """ + if pos == 0: + return + + cdef SIZE_t parent_pos = (pos - 1) / 2 + + if heap[parent_pos].improvement < heap[pos].improvement: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil: + """Restore heap invariant parent.improvement > children.improvement from + ``pos`` downwards. """ + cdef SIZE_t left_pos = 2 * (pos + 1) - 1 + cdef SIZE_t right_pos = 2 * (pos + 1) + cdef SIZE_t largest = pos + + if (left_pos < heap_length and + heap[left_pos].improvement > heap[largest].improvement): + largest = left_pos + + if (right_pos < heap_length and + heap[right_pos].improvement > heap[largest].improvement): + largest = right_pos + + if largest != pos: + heap[pos], heap[largest] = heap[largest], heap[pos] + self.heapify_down(heap, largest, heap_length) + cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos, SIZE_t depth, bint is_leaf, double improvement, double impurity, double impurity_left, @@ -276,7 +274,7 @@ cdef class PriorityHeap: heap[heap_ptr].improvement = improvement # Heapify up - heapify_up(heap, heap_ptr) + self.heapify_up(heap, heap_ptr) # Increase element count self.heap_ptr = heap_ptr + 1 @@ -298,7 +296,7 @@ cdef class PriorityHeap: # Restore heap invariant if heap_ptr > 1: - heapify_down(heap, 0, heap_ptr - 1) + self.heapify_down(heap, 0, heap_ptr - 1) self.heap_ptr = heap_ptr - 1 From b58b0b5147b5932bc9638a1a798a8d3faa04714c Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 17:31:48 -0500 Subject: [PATCH 2/7] spurious comment to force recythonization of boosting --- sklearn/ensemble/_gradient_boosting.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index 71371f5c24a48..98a36d26efaf9 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -5,7 +5,7 @@ # Author: Peter Prettenhofer # # License: BSD 3 clause - +# spurious comment to force recythonization cimport cython from libc.stdlib cimport free From b395f8184f3e1b478d00565941bdf66d32562286 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 17:45:50 -0500 Subject: [PATCH 3/7] [ci skip] remove spurious comment --- sklearn/ensemble/_gradient_boosting.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index 98a36d26efaf9..71371f5c24a48 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -5,7 +5,7 @@ # Author: Peter Prettenhofer # # License: BSD 3 clause -# spurious comment to force recythonization + cimport cython from libc.stdlib cimport free From edb39568d01892eaf0db62b205cd9bc0f9c1a67b Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 07:24:27 -0500 Subject: [PATCH 4/7] remove trailing whitespace on line --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 83229816b53f1..4cc81d0c178af 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -267,7 +267,7 @@ cdef class ClassificationCriterion(Criterion): self.sum_left = calloc(n_elements, sizeof(double)) self.sum_right = calloc(n_elements, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() From 0c382475b35eeb4fe87af669dd344fa70934edde Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 11:46:53 -0700 Subject: [PATCH 5/7] style: fix trailing whitespace in _criterion.pxd --- sklearn/tree/_criterion.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index cf6d32d1b7fe1..57dacc0726c0a 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -45,7 +45,7 @@ cdef class Criterion: # weighted count of each label. For regression, # the sum of w*y. sum_total[k] is equal to # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k], - # where k is output index. + # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split From 0b7cf88356dd43592116ca26446f8a22ebba214f Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 24 Oct 2016 08:16:18 -0700 Subject: [PATCH 6/7] add spurious comments to try to force recythonizing --- sklearn/tree/_criterion.pyx | 2 +- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- sklearn/tree/_utils.pyx | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 4cc81d0c178af..7f530e62bc141 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -30,7 +30,7 @@ from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray from ._utils cimport WeightedMedianCalculator - +# remove me cdef class Criterion: """Interface for impurity criteria. diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..61a0011110ff9 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause - +# remove me from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f8632ab1640d8..278b5715fff65 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -15,7 +15,7 @@ # Nelson Liu # # License: BSD 3 clause - +# remove me from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 465afebc99ffa..768ca90187099 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -20,7 +20,7 @@ from libc.math cimport log as ln import numpy as np cimport numpy as np np.import_array() - +# remove me # ============================================================================= # Helper functions # ============================================================================= From eda8953a5c8a4071f7d10e2fc9a688a9c66e768d Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 24 Oct 2016 08:51:57 -0700 Subject: [PATCH 7/7] remove changes for recythonization --- sklearn/tree/_criterion.pyx | 2 +- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- sklearn/tree/_utils.pyx | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 7f530e62bc141..4cc81d0c178af 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -30,7 +30,7 @@ from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray from ._utils cimport WeightedMedianCalculator -# remove me + cdef class Criterion: """Interface for impurity criteria. diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 61a0011110ff9..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause -# remove me + from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 278b5715fff65..f8632ab1640d8 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -15,7 +15,7 @@ # Nelson Liu # # License: BSD 3 clause -# remove me + from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 768ca90187099..465afebc99ffa 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -20,7 +20,7 @@ from libc.math cimport log as ln import numpy as np cimport numpy as np np.import_array() -# remove me + # ============================================================================= # Helper functions # =============================================================================