Skip to content

[WIP] added comments to tree splitter code #8779

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 130 additions & 38 deletions sklearn/tree/_splitter.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -387,32 +387,66 @@ cdef class BestSplitter(BaseDenseSplitter):
n_visited_features += 1

# Loop invariant: elements of features in
# - [:n_drawn_constant[ holds drawn and known constant features;
# - [n_drawn_constant:n_known_constant[ holds known constant
# features that haven't been drawn yet;
# - [n_known_constant:n_total_constant[ holds newly found constant
# features;
# - [n_total_constant:f_i[ holds features that haven't been drawn
# yet and aren't constant apriori.
# - [f_i:n_features[ holds features that have been drawn
# and aren't constant.
#
# - [:n_drawn_constants[ holds drawn and known constant
# features;
#
# - [n_drawn_constants:n_known_constants[ holds known constant
# features that haven't been
# drawn yet - these are known
# to be constant from previous
# calls to this function;
#
# - [n_known_constants:n_total_constants[ holds newly found constant
# features;
#
# - [n_total_constants:f_i[ holds features that haven't
# been drawn yet and aren't
# constant apriori.
#
# - [f_i:n_features[ holds features that have been
# drawn and aren't constant.

# Draw a feature at random
#
# we actually want to draw from two usually disjoint ranges
# [n_drawn_constants:n_known_constants[ and
# [n_total_constants:f_i[ but we shift the second range
# down to start at n_known_constants to have one consecutive range
#
# f_i - n_found_constants (the upper end argument to rand_int) is
# equal to (f_i - n_total_constants) + n_known_constants
# because n_total_constants = n_known_constants + n_found_constants
# (see cdef of n_total_constants)

f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
random_state)

if f_j < n_known_constants:
# f_j in the interval [n_drawn_constants, n_known_constants[
# i.e. this is a feature which is known to be constant
# (from previous calls to this function) but has not been
# drawn before

# swap features[f_j] with features[n_drawn_constants]
# effectively moving the drawn feature f_j to the first
# range in features which has drawn and known constant
# features

tmp = features[f_j]
features[f_j] = features[n_drawn_constants]
features[n_drawn_constants] = tmp

n_drawn_constants += 1

else:
# f_j in the interval [n_known_constants, f_i - n_found_constants[
# f_j is in the interval [n_known_constants, f_i - n_found_constants[
f_j += n_found_constants
# f_j in the interval [n_total_constants, f_i[

# f_j is now in the interval [n_total_constants, f_i[
# i.e. in the part of the features array where we keep
# features which have not been drawn yet and aren't
# constant a priori
current.feature = features[f_j]
feature_offset = self.X_feature_stride * current.feature

Expand All @@ -424,6 +458,8 @@ cdef class BestSplitter(BaseDenseSplitter):
p = start
feature_idx_offset = self.X_idx_sorted_stride * current.feature

# fill masked presorted samples into Xf

for i in range(self.n_total_samples):
j = X_idx_sorted[i + feature_idx_offset]
if sample_mask[j] == 1:
Expand All @@ -437,14 +473,33 @@ cdef class BestSplitter(BaseDenseSplitter):
sort(Xf + start, samples + start, end - start)

if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD:

# highest value is not greater than first value
# within tolerance, this is a newly discovered
# constant feature in the sample range [start:end[
# and will be constant in all children of this node

# swap features at f_j (which is current.feature)
# and at n_total_constants effectively
# moving feature f_j to the area in features
# which newly found constant features

features[f_j] = features[n_total_constants]
features[n_total_constants] = current.feature

n_found_constants += 1
n_total_constants += 1

else:
# f_i points to the first feature which has been drawn and isn't constant
# this is at the end of the features array, we fill this part of the
# features array starting from the end
f_i -= 1

# swap features at f_i (newest feature which has been drawn and isn't constant)
# and f_j (the drawn feature), i.e. put feature at f_j to the
# last area in features which contains drawn features which
# are not constant
features[f_i], features[f_j] = features[f_j], features[f_i]

# Evaluate all splits
Expand Down Expand Up @@ -488,6 +543,12 @@ cdef class BestSplitter(BaseDenseSplitter):

best = current # copy

# end of loop over values of this feature

# end if feature is not a newly discovered constant feature

# end if feature is not a previously known constant feature

# Reorganize into samples[start:best.pos] + samples[best.pos:end]
if best.pos < end:
feature_offset = X_feature_stride * best.feature
Expand All @@ -501,6 +562,7 @@ cdef class BestSplitter(BaseDenseSplitter):
else:
partition_end -= 1

# swap samples[p] and samples[partition_end]
tmp = samples[partition_end]
samples[partition_end] = samples[p]
samples[p] = tmp
Expand Down Expand Up @@ -720,15 +782,25 @@ cdef class RandomSplitter(BaseDenseSplitter):
n_visited_features += 1

# Loop invariant: elements of features in
# - [:n_drawn_constant[ holds drawn and known constant features;
# - [n_drawn_constant:n_known_constant[ holds known constant
# features that haven't been drawn yet;
# - [n_known_constant:n_total_constant[ holds newly found constant
# features;
# - [n_total_constant:f_i[ holds features that haven't been drawn
# yet and aren't constant apriori.
# - [f_i:n_features[ holds features that have been drawn
# and aren't constant.
#
# - [:n_drawn_constants[ holds drawn and known constant
# features;
#
# - [n_drawn_constants:n_known_constants[ holds known constant
# features that haven't been
# drawn yet - these are known
# to be constant from previous
# calls to this function;
#
# - [n_known_constants:n_total_constants[ holds newly found constant
# features;
#
# - [n_total_constants:f_i[ holds features that haven't
# been drawn yet and aren't
# constant apriori.
#
# - [f_i:n_features[ holds features that have been
# drawn and aren't constant.

# Draw a feature at random
f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
Expand Down Expand Up @@ -1271,15 +1343,25 @@ cdef class BestSparseSplitter(BaseSparseSplitter):
n_visited_features += 1

# Loop invariant: elements of features in
# - [:n_drawn_constant[ holds drawn and known constant features;
# - [n_drawn_constant:n_known_constant[ holds known constant
# features that haven't been drawn yet;
# - [n_known_constant:n_total_constant[ holds newly found constant
# features;
# - [n_total_constant:f_i[ holds features that haven't been drawn
# yet and aren't constant apriori.
# - [f_i:n_features[ holds features that have been drawn
# and aren't constant.
#
# - [:n_drawn_constants[ holds drawn and known constant
# features;
#
# - [n_drawn_constants:n_known_constants[ holds known constant
# features that haven't been
# drawn yet - these are known
# to be constant from previous
# calls to this function;
#
# - [n_known_constants:n_total_constants[ holds newly found constant
# features;
#
# - [n_total_constants:f_i[ holds features that haven't
# been drawn yet and aren't
# constant apriori.
#
# - [f_i:n_features[ holds features that have been
# drawn and aren't constant.

# Draw a feature at random
f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
Expand Down Expand Up @@ -1504,15 +1586,25 @@ cdef class RandomSparseSplitter(BaseSparseSplitter):
n_visited_features += 1

# Loop invariant: elements of features in
# - [:n_drawn_constant[ holds drawn and known constant features;
# - [n_drawn_constant:n_known_constant[ holds known constant
# features that haven't been drawn yet;
# - [n_known_constant:n_total_constant[ holds newly found constant
# features;
# - [n_total_constant:f_i[ holds features that haven't been drawn
# yet and aren't constant apriori.
# - [f_i:n_features[ holds features that have been drawn
# and aren't constant.
#
# - [:n_drawn_constants[ holds drawn and known constant
# features;
#
# - [n_drawn_constants:n_known_constants[ holds known constant
# features that haven't been
# drawn yet - these are known
# to be constant from previous
# calls to this function;
#
# - [n_known_constants:n_total_constants[ holds newly found constant
# features;
#
# - [n_total_constants:f_i[ holds features that haven't
# been drawn yet and aren't
# constant apriori.
#
# - [f_i:n_features[ holds features that have been
# drawn and aren't constant.

# Draw a feature at random
f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
Expand Down