-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
[MRG+3] FIX Memory leak in MAE; Use safe_realloc; Acquire GIL only when raising; Propagate all errors to python interpreter level (#7811) #8002
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e37ef50
208c336
b355ec0
086506c
caff5d0
c748c64
2d86faf
34e6992
96bac83
1be08df
8cb8b5d
f756bd7
e653f6f
34adc80
2f3bced
9b12a69
3ce3d52
c67ebd7
318705d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,11 +51,14 @@ cdef class Criterion: | |
def __setstate__(self, d): | ||
pass | ||
|
||
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, | ||
double weighted_n_samples, SIZE_t* samples, SIZE_t start, | ||
SIZE_t end) nogil: | ||
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, | ||
double weighted_n_samples, SIZE_t* samples, SIZE_t start, | ||
SIZE_t end) nogil except -1: | ||
"""Placeholder for a method which will initialize the criterion. | ||
|
||
Returns -1 in case of failure to allocate memory (and raise MemoryError) | ||
or 0 otherwise. | ||
|
||
Parameters | ||
---------- | ||
y : array-like, dtype=DOUBLE_t | ||
|
@@ -79,22 +82,22 @@ cdef class Criterion: | |
|
||
pass | ||
|
||
cdef void reset(self) nogil: | ||
cdef int reset(self) nogil except -1: | ||
"""Reset the criterion at pos=start. | ||
|
||
This method must be implemented by the subclass. | ||
""" | ||
|
||
pass | ||
|
||
cdef void reverse_reset(self) nogil: | ||
cdef int reverse_reset(self) nogil except -1: | ||
"""Reset the criterion at pos=end. | ||
|
||
This method must be implemented by the subclass. | ||
""" | ||
pass | ||
|
||
cdef void update(self, SIZE_t new_pos) nogil: | ||
cdef int update(self, SIZE_t new_pos) nogil except -1: | ||
"""Updated statistics by moving samples[pos:new_pos] to the left child. | ||
|
||
This updates the collected statistics by moving samples[pos:new_pos] | ||
|
@@ -281,12 +284,15 @@ cdef class ClassificationCriterion(Criterion): | |
sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)), | ||
self.__getstate__()) | ||
|
||
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, | ||
DOUBLE_t* sample_weight, double weighted_n_samples, | ||
SIZE_t* samples, SIZE_t start, SIZE_t end) nogil: | ||
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, | ||
DOUBLE_t* sample_weight, double weighted_n_samples, | ||
SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: | ||
"""Initialize the criterion at node samples[start:end] and | ||
children samples[start:start] and samples[start:end]. | ||
|
||
Returns -1 in case of failure to allocate memory (and raise MemoryError) | ||
or 0 otherwise. | ||
|
||
Parameters | ||
---------- | ||
y : array-like, dtype=DOUBLE_t | ||
|
@@ -347,10 +353,14 @@ cdef class ClassificationCriterion(Criterion): | |
|
||
# Reset to pos=start | ||
self.reset() | ||
return 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should return the output of reset here, no? How can the Cython execption detection work otherwise? If we always return zero we basically tell Cython that it should never check for the presence of an exception to be raised. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure you're right:
From here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok good. I was wondering why the tests were passing :) |
||
|
||
cdef void reset(self) nogil: | ||
"""Reset the criterion at pos=start.""" | ||
cdef int reset(self) nogil except -1: | ||
"""Reset the criterion at pos=start | ||
|
||
Returns -1 in case of failure to allocate memory (and raise MemoryError) | ||
or 0 otherwise. | ||
""" | ||
self.pos = self.start | ||
|
||
self.weighted_n_left = 0.0 | ||
|
@@ -370,9 +380,14 @@ cdef class ClassificationCriterion(Criterion): | |
sum_total += self.sum_stride | ||
sum_left += self.sum_stride | ||
sum_right += self.sum_stride | ||
return 0 | ||
|
||
cdef void reverse_reset(self) nogil: | ||
"""Reset the criterion at pos=end.""" | ||
cdef int reverse_reset(self) nogil except -1: | ||
"""Reset the criterion at pos=end | ||
|
||
Returns -1 in case of failure to allocate memory (and raise MemoryError) | ||
or 0 otherwise. | ||
""" | ||
self.pos = self.end | ||
|
||
self.weighted_n_left = self.weighted_n_node_samples | ||
|
@@ -392,10 +407,14 @@ cdef class ClassificationCriterion(Criterion): | |
sum_total += self.sum_stride | ||
sum_left += self.sum_stride | ||
sum_right += self.sum_stride | ||
return 0 | ||
|
||
cdef void update(self, SIZE_t new_pos) nogil: | ||
cdef int update(self, SIZE_t new_pos) nogil except -1: | ||
"""Updated statistics by moving samples[pos:new_pos] to the left child. | ||
|
||
Returns -1 in case of failure to allocate memory (and raise MemoryError) | ||
or 0 otherwise. | ||
|
||
Parameters | ||
---------- | ||
new_pos : SIZE_t | ||
|
@@ -470,6 +489,7 @@ cdef class ClassificationCriterion(Criterion): | |
sum_total += self.sum_stride | ||
|
||
self.pos = new_pos | ||
return 0 | ||
|
||
cdef double node_impurity(self) nogil: | ||
pass | ||
|
@@ -736,9 +756,9 @@ cdef class RegressionCriterion(Criterion): | |
def __reduce__(self): | ||
return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) | ||
|
||
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, | ||
double weighted_n_samples, SIZE_t* samples, SIZE_t start, | ||
SIZE_t end) nogil: | ||
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, | ||
double weighted_n_samples, SIZE_t* samples, SIZE_t start, | ||
SIZE_t end) nogil except -1: | ||
"""Initialize the criterion at node samples[start:end] and | ||
children samples[start:start] and samples[start:end].""" | ||
# Initialize fields | ||
|
@@ -778,8 +798,9 @@ cdef class RegressionCriterion(Criterion): | |
|
||
# Reset to pos=start | ||
self.reset() | ||
return 0 | ||
|
||
cdef void reset(self) nogil: | ||
cdef int reset(self) nogil except -1: | ||
"""Reset the criterion at pos=start.""" | ||
cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) | ||
memset(self.sum_left, 0, n_bytes) | ||
|
@@ -788,8 +809,9 @@ cdef class RegressionCriterion(Criterion): | |
self.weighted_n_left = 0.0 | ||
self.weighted_n_right = self.weighted_n_node_samples | ||
self.pos = self.start | ||
return 0 | ||
|
||
cdef void reverse_reset(self) nogil: | ||
cdef int reverse_reset(self) nogil except -1: | ||
"""Reset the criterion at pos=end.""" | ||
cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) | ||
memset(self.sum_right, 0, n_bytes) | ||
|
@@ -798,8 +820,9 @@ cdef class RegressionCriterion(Criterion): | |
self.weighted_n_right = 0.0 | ||
self.weighted_n_left = self.weighted_n_node_samples | ||
self.pos = self.end | ||
return 0 | ||
|
||
cdef void update(self, SIZE_t new_pos) nogil: | ||
cdef int update(self, SIZE_t new_pos) nogil except -1: | ||
"""Updated statistics by moving samples[pos:new_pos] to the left.""" | ||
|
||
cdef double* sum_left = self.sum_left | ||
|
@@ -859,6 +882,7 @@ cdef class RegressionCriterion(Criterion): | |
sum_right[k] = sum_total[k] - sum_left[k] | ||
|
||
self.pos = new_pos | ||
return 0 | ||
|
||
cdef double node_impurity(self) nogil: | ||
pass | ||
|
@@ -1018,19 +1042,16 @@ cdef class MAE(RegressionCriterion): | |
# Allocate memory for the accumulators | ||
safe_realloc(&self.node_medians, n_outputs) | ||
|
||
if (self.node_medians == NULL): | ||
raise MemoryError() | ||
|
||
self.left_child = np.empty(n_outputs, dtype='object') | ||
self.right_child = np.empty(n_outputs, dtype='object') | ||
# initialize WeightedMedianCalculators | ||
for k in range(n_outputs): | ||
self.left_child[k] = WeightedMedianCalculator(n_samples) | ||
self.right_child[k] = WeightedMedianCalculator(n_samples) | ||
|
||
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, | ||
double weighted_n_samples, SIZE_t* samples, SIZE_t start, | ||
SIZE_t end) nogil: | ||
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, | ||
double weighted_n_samples, SIZE_t* samples, SIZE_t start, | ||
SIZE_t end) nogil except -1: | ||
"""Initialize the criterion at node samples[start:end] and | ||
children samples[start:start] and samples[start:end].""" | ||
|
||
|
@@ -1068,6 +1089,7 @@ cdef class MAE(RegressionCriterion): | |
for k in range(self.n_outputs): | ||
y_ik = y[i * y_stride + k] | ||
|
||
# push method ends up calling safe_realloc, hence `except -1` | ||
# push all values to the right side, | ||
# since pos = start initially anyway | ||
(<WeightedMedianCalculator> right_child[k]).push(y_ik, w) | ||
|
@@ -1079,9 +1101,14 @@ cdef class MAE(RegressionCriterion): | |
|
||
# Reset to pos=start | ||
self.reset() | ||
return 0 | ||
|
||
cdef void reset(self) nogil: | ||
"""Reset the criterion at pos=start.""" | ||
cdef int reset(self) nogil except -1: | ||
"""Reset the criterion at pos=start | ||
|
||
Returns -1 in case of failure to allocate memory (and raise MemoryError) | ||
or 0 otherwise. | ||
""" | ||
|
||
cdef SIZE_t i, k | ||
cdef DOUBLE_t value | ||
|
@@ -1103,11 +1130,17 @@ cdef class MAE(RegressionCriterion): | |
# remove everything from left and put it into right | ||
(<WeightedMedianCalculator> left_child[k]).pop(&value, | ||
&weight) | ||
# push method ends up calling safe_realloc, hence `except -1` | ||
(<WeightedMedianCalculator> right_child[k]).push(value, | ||
weight) | ||
return 0 | ||
|
||
cdef void reverse_reset(self) nogil: | ||
"""Reset the criterion at pos=end.""" | ||
cdef int reverse_reset(self) nogil except -1: | ||
"""Reset the criterion at pos=end | ||
|
||
Returns -1 in case of failure to allocate memory (and raise MemoryError) | ||
or 0 otherwise. | ||
""" | ||
|
||
self.weighted_n_right = 0.0 | ||
self.weighted_n_left = self.weighted_n_node_samples | ||
|
@@ -1126,11 +1159,17 @@ cdef class MAE(RegressionCriterion): | |
# remove everything from right and put it into left | ||
(<WeightedMedianCalculator> right_child[k]).pop(&value, | ||
&weight) | ||
# push method ends up calling safe_realloc, hence `except -1` | ||
(<WeightedMedianCalculator> left_child[k]).push(value, | ||
weight) | ||
return 0 | ||
|
||
cdef void update(self, SIZE_t new_pos) nogil: | ||
"""Updated statistics by moving samples[pos:new_pos] to the left.""" | ||
cdef int update(self, SIZE_t new_pos) nogil except -1: | ||
"""Updated statistics by moving samples[pos:new_pos] to the left | ||
|
||
Returns -1 in case of failure to allocate memory (and raise MemoryError) | ||
or 0 otherwise. | ||
""" | ||
|
||
cdef DOUBLE_t* sample_weight = self.sample_weight | ||
cdef SIZE_t* samples = self.samples | ||
|
@@ -1162,6 +1201,7 @@ cdef class MAE(RegressionCriterion): | |
y_ik = y[i * self.y_stride + k] | ||
# remove y_ik and its weight w from right and add to left | ||
(<WeightedMedianCalculator> right_child[k]).remove(y_ik, w) | ||
# push method ends up calling safe_realloc, hence except -1 | ||
(<WeightedMedianCalculator> left_child[k]).push(y_ik, w) | ||
|
||
self.weighted_n_left += w | ||
|
@@ -1185,6 +1225,7 @@ cdef class MAE(RegressionCriterion): | |
self.weighted_n_right = (self.weighted_n_node_samples - | ||
self.weighted_n_left) | ||
self.pos = new_pos | ||
return 0 | ||
|
||
cdef void node_value(self, double* dest) nogil: | ||
"""Computes the node value of samples[start:end] into dest.""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If these functions do not have real return values, I think "void ... except *" is most appropriate.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The problem with that is
void ... except *
has a tiny overhead that gets amplified as the functions are called multiple times.in ... except -1
has very low overhead as thePyErr_Occurred()
is not called to check for any Python errors that were raised...There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(The above benchmarks confirm that...)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's fine have those C functions to use an int return code that is used to flag memory allocation error. It can indeed help Cython reduce it's overhead.