Skip to content

ENH HGBT histograms in blocks of features #27168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
29 changes: 29 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,10 @@ def fit(self, X, y, sample_weight=None):
acc_find_split_time = 0.0 # time spent finding the best splits
acc_apply_split_time = 0.0 # time spent splitting nodes
acc_compute_hist_time = 0.0 # time spent computing histograms
time_hist_subtract = 0.0
time_hist_copy_gradients = 0.0
time_hist_root = 0.0
time_hist_node = 0.0
# time spent predicting X for gradient and hessians update
acc_prediction_time = 0.0
X, known_categories = self._preprocess_X(X, reset=True)
Expand Down Expand Up @@ -911,6 +915,10 @@ def fit(self, X, y, sample_weight=None):
acc_apply_split_time += grower.total_apply_split_time
acc_find_split_time += grower.total_find_split_time
acc_compute_hist_time += grower.total_compute_hist_time
time_hist_subtract += grower.time_hist_subtract
time_hist_copy_gradients += grower.time_hist_copy_gradients
time_hist_root += grower.time_hist_root
time_hist_node += grower.time_hist_node

if not self._loss.differentiable:
_update_leaves_values(
Expand Down Expand Up @@ -1004,6 +1012,27 @@ def fit(self, X, y, sample_weight=None):
"Time spent computing histograms:", acc_compute_hist_time
)
)
print(
"{:<32} {:.3f}s".format(
" Of which time spent computing histogram subtractions:",
time_hist_subtract,
)
)
print(
"{:<32} {:.3f}s".format(
" Of which time spent copying gradients:", time_hist_copy_gradients
)
)
print(
"{:<32} {:.3f}s".format(
" Of which time spent computing root histograms:", time_hist_root
)
)
print(
"{:<32} {:.3f}s".format(
" Of which time spent computing node histograms:", time_hist_node
)
)
print(
"{:<32} {:.3f}s".format(
"Time spent finding best splits:", acc_find_split_time
Expand Down
29 changes: 26 additions & 3 deletions sklearn/ensemble/_hist_gradient_boosting/grower.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,10 @@ def __init__(
self.finalized_leaves = []
self.total_find_split_time = 0.0 # time spent finding the best splits
self.total_compute_hist_time = 0.0 # time spent computing histograms
self.time_hist_subtract = 0.0 # time spent computing histograms by subtraction
self.time_hist_copy_gradients = 0.0 # time spent copying gradients
self.time_hist_root = 0.0 # time spent computing root histograms
self.time_hist_node = 0.0 # time spent computing non-root node histograms
self.total_apply_split_time = 0.0 # time spent splitting nodes
self.n_categorical_splits = 0
self._intilialize_root(gradients, hessians, hessians_are_constant)
Expand Down Expand Up @@ -425,10 +429,18 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant):
)

tic = time()
self.root.histograms = self.histogram_builder.compute_histograms_brute(
(
self.root.histograms,
time_hist_copy_gradients,
time_hist_root,
time_hist_node,
) = self.histogram_builder.compute_histograms_brute(
self.root.sample_indices, self.root.allowed_features
)
self.total_compute_hist_time += time() - tic
self.time_hist_copy_gradients += time_hist_copy_gradients
self.time_hist_root += time_hist_root
self.time_hist_node += time_hist_node

tic = time()
self._compute_best_split_and_push(self.root)
Expand Down Expand Up @@ -593,17 +605,28 @@ def split_next(self):
# on the other one.
# Note that both left and right child have the same allowed_features.
tic = time()
smallest_child.histograms = self.histogram_builder.compute_histograms_brute(
(
smallest_child.histograms,
time_hist_copy_gradients,
time_hist_root,
time_hist_node,
) = self.histogram_builder.compute_histograms_brute(
smallest_child.sample_indices, smallest_child.allowed_features
)
tic2 = time()
largest_child.histograms = (
self.histogram_builder.compute_histograms_subtraction(
node.histograms,
smallest_child.histograms,
smallest_child.allowed_features,
)
)
self.total_compute_hist_time += time() - tic
end_time = time()
self.time_hist_subtract += end_time - tic2
self.total_compute_hist_time += end_time - tic
self.time_hist_copy_gradients += time_hist_copy_gradients
self.time_hist_root += time_hist_root
self.time_hist_node += time_hist_node

tic = time()
if should_split_left:
Expand Down
Loading