From 564106d9c4c6f2f1171fd2aa42b065958413b79a Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Fri, 14 Aug 2020 19:47:05 -0400 Subject: [PATCH 1/6] don't keep histograms of inner nodes around --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index e0b54550d3082..6185226c7d2ed 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -6,6 +6,8 @@ """ # Author: Nicolas Hug +from memory_profiler import profile + from heapq import heappush, heappop import numpy as np from timeit import default_timer as time @@ -485,6 +487,7 @@ def split_next(self): if should_split_right: self._compute_best_split_and_push(right_child_node) self.total_find_split_time += time() - tic + del node.histograms return left_child_node, right_child_node From 1910e6c9acf827753119cb9358e301deeade9749 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Fri, 14 Aug 2020 19:55:36 -0400 Subject: [PATCH 2/6] also delete leaf histograms --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 6185226c7d2ed..c262e76590940 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -487,8 +487,13 @@ def split_next(self): if should_split_right: self._compute_best_split_and_push(right_child_node) self.total_find_split_time += time() - tic + + for child in [left_child_node, right_child_node]: + if child.is_leaf: + del child.histograms del node.histograms + return left_child_node, right_child_node def _finalize_leaf(self, node): From 90e949a61cb1bd468083e39d1444d08fbc7c7820 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Fri, 14 Aug 2020 19:57:15 -0400 Subject: [PATCH 3/6] don't import memory profiler --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index c262e76590940..e2d51f4bdced9 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -6,7 +6,6 @@ """ # Author: Nicolas Hug -from memory_profiler import profile from heapq import heappush, heappop import numpy as np From 1177bf0289c7212f17274b46268dd9da4ff21c45 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Tue, 18 Aug 2020 16:22:09 -0400 Subject: [PATCH 4/6] delete grower in each iteration, call gc. --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 3 +++ sklearn/ensemble/_hist_gradient_boosting/grower.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5c824f5263d1b..4d831a2d02421 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -3,6 +3,7 @@ from abc import ABC, abstractmethod from functools import partial +import gc import numpy as np from timeit import default_timer as time @@ -391,6 +392,8 @@ def fit(self, X, y, sample_weight=None): _update_raw_predictions(raw_predictions[k, :], grower) toc_pred = time() acc_prediction_time += toc_pred - tic_pred + del grower + gc.collect() should_early_stop = False if self.do_early_stopping_: diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index e2d51f4bdced9..05c9763e24c22 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -492,7 +492,6 @@ def split_next(self): del child.histograms del node.histograms - return left_child_node, right_child_node def _finalize_leaf(self, node): From 1fa6d4117473407b214d7351aeca302d9da1db41 Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Fri, 21 Aug 2020 13:35:45 -0400 Subject: [PATCH 5/6] explicitly delete histograms, don't run the GC --- .../_hist_gradient_boosting/gradient_boosting.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 4d831a2d02421..d75ef81d81f97 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -342,6 +342,14 @@ def fit(self, X, y, sample_weight=None): sample_weight=sample_weight_train ) + def del_hists(node): + if node is None: + return + if getattr(node, 'histograms', None) is not None: + del node.histograms + del_hists(node.left_child) + del_hists(node.right_child) + for iteration in range(begin_at_stage, self.max_iter): if self.verbose: @@ -392,8 +400,9 @@ def fit(self, X, y, sample_weight=None): _update_raw_predictions(raw_predictions[k, :], grower) toc_pred = time() acc_prediction_time += toc_pred - tic_pred + + del_hists(grower.root) del grower - gc.collect() should_early_stop = False if self.do_early_stopping_: From 6c5011e7b5b289017b1753b19b6eaeba4105544b Mon Sep 17 00:00:00 2001 From: Andreas C Mueller Date: Fri, 21 Aug 2020 13:37:40 -0400 Subject: [PATCH 6/6] remove unused import --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index d75ef81d81f97..5a605cef2437b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -3,7 +3,6 @@ from abc import ABC, abstractmethod from functools import partial -import gc import numpy as np from timeit import default_timer as time