From a33909f574b3f8afb47c9e54b3af0ba677949c40 Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sat, 25 May 2019 12:11:43 -0500
Subject: [PATCH 01/12] Relaxed criterion specification so plot_tree no longer
 writes entropy for any criterion in decision tree

---
 sklearn/tree/export.py            | 10 +++++-----
 sklearn/tree/tests/test_export.py |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py
index 636ef03689a79..5320305478d36 100644
--- a/sklearn/tree/export.py
+++ b/sklearn/tree/export.py
@@ -547,15 +547,15 @@ def __init__(self, max_depth=None, feature_names=None,
 
         self.arrow_args = dict(arrowstyle="<-")
 
-    def _make_tree(self, node_id, et, depth=0):
+    def _make_tree(self, node_id, et, criterion, depth=0):
         # traverses _tree.Tree recursively, builds intermediate
         # "_reingold_tilford.Tree" object
-        name = self.node_to_str(et, node_id, criterion='entropy')
+        name = self.node_to_str(et, node_id, criterion=criterion)
         if (et.children_left[node_id] != _tree.TREE_LEAF
                 and (self.max_depth is None or depth <= self.max_depth)):
-            children = [self._make_tree(et.children_left[node_id], et,
+            children = [self._make_tree(et.children_left[node_id], et, criterion,
                                         depth=depth + 1),
-                        self._make_tree(et.children_right[node_id], et,
+                        self._make_tree(et.children_right[node_id], et, criterion,
                                         depth=depth + 1)]
         else:
             return Tree(name, node_id)
@@ -568,7 +568,7 @@ def export(self, decision_tree, ax=None):
             ax = plt.gca()
         ax.clear()
         ax.set_axis_off()
-        my_tree = self._make_tree(0, decision_tree.tree_)
+        my_tree = self._make_tree(0, decision_tree.tree_, decision_tree.criterion)
         draw_tree = buchheim(my_tree)
 
         # important to make sure we're still
diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py
index eed9be7bcb5d9..7fce48a5cb022 100644
--- a/sklearn/tree/tests/test_export.py
+++ b/sklearn/tree/tests/test_export.py
@@ -412,7 +412,7 @@ def test_plot_tree(pyplot):
     feature_names = ['first feat', 'sepal_width']
     nodes = plot_tree(clf, feature_names=feature_names)
     assert len(nodes) == 3
-    assert nodes[0].get_text() == ("first feat <= 0.0\nentropy = 0.5\n"
+    assert nodes[0].get_text() == ("first feat <= 0.0\ngini = 0.5\n"
                                    "samples = 6\nvalue = [3, 3]")
-    assert nodes[1].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [3, 0]"
+    assert nodes[1].get_text() == "gini = 0.0\nsamples = 3\nvalue = [3, 0]"
-    assert nodes[2].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [0, 3]"
+    assert nodes[2].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]"
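[Reviewer note] The patch above threads the estimator's own `criterion` attribute through `_make_tree` instead of hard-coding `'entropy'` in the call to `node_to_str`. A minimal usage sketch of the behavior being fixed (an illustrative toy snippet, not part of the patch; it assumes matplotlib is available):

```python
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(criterion="gini", random_state=0).fit(X, y)

# Before this patch every node annotation read "entropy = ...", no matter
# which criterion the tree was grown with; with the fix the labels follow
# clf.criterion ("gini" here).
plot_tree(clf)
plt.show()
```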
From 9a7ba92d60203a8751d59258ff2c1d1aec1e49a1 Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sat, 25 May 2019 12:21:23 -0500
Subject: [PATCH 02/12]  Please enter the commit message for your changes.
 Lines starting

---
 sklearn/tree/export.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py
index 5320305478d36..f1d70bf2558af 100644
--- a/sklearn/tree/export.py
+++ b/sklearn/tree/export.py
@@ -553,10 +553,10 @@ def _make_tree(self, node_id, et, criterion, depth=0):
         name = self.node_to_str(et, node_id, criterion=criterion)
         if (et.children_left[node_id] != _tree.TREE_LEAF
                 and (self.max_depth is None or depth <= self.max_depth)):
-            children = [self._make_tree(et.children_left[node_id], et, criterion,
-                                        depth=depth + 1),
-                        self._make_tree(et.children_right[node_id], et, criterion,
-                                        depth=depth + 1)]
+            children = [self._make_tree(et.children_left[node_id], et,
+                                        criterion, depth=depth + 1),
+                        self._make_tree(et.children_right[node_id], et,
+                                        criterion, depth=depth + 1)]
         else:
             return Tree(name, node_id)
         return Tree(name, node_id, *children)

From 6881f8555d7b7b4a3d26eede032c94191b97be4e Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sat, 25 May 2019 12:22:22 -0500
Subject: [PATCH 03/12] Revert " Please enter the commit message for your
 changes. Lines starting"

This reverts commit 9a7ba92d
---
 sklearn/tree/export.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py
index f1d70bf2558af..5320305478d36 100644
--- a/sklearn/tree/export.py
+++ b/sklearn/tree/export.py
@@ -553,10 +553,10 @@ def _make_tree(self, node_id, et, criterion, depth=0):
         name = self.node_to_str(et, node_id, criterion=criterion)
         if (et.children_left[node_id] != _tree.TREE_LEAF
                 and (self.max_depth is None or depth <= self.max_depth)):
-            children = [self._make_tree(et.children_left[node_id], et,
-                                        criterion, depth=depth + 1),
-                        self._make_tree(et.children_right[node_id], et,
-                                        criterion, depth=depth + 1)]
+            children = [self._make_tree(et.children_left[node_id], et, criterion,
+                                        depth=depth + 1),
+                        self._make_tree(et.children_right[node_id], et, criterion,
+                                        depth=depth + 1)]
         else:
             return Tree(name, node_id)
         return Tree(name, node_id, *children)

From 9d1083922ef350e62146223c1668314759a9f565 Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sat, 25 May 2019 12:26:45 -0500
Subject: [PATCH 04/12] Ensure changes conform to flake8 character limits

---
 sklearn/tree/export.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py
index 5320305478d36..f1d70bf2558af 100644
--- a/sklearn/tree/export.py
+++ b/sklearn/tree/export.py
@@ -553,10 +553,10 @@ def _make_tree(self, node_id, et, criterion, depth=0):
         name = self.node_to_str(et, node_id, criterion=criterion)
         if (et.children_left[node_id] != _tree.TREE_LEAF
                 and (self.max_depth is None or depth <= self.max_depth)):
-            children = [self._make_tree(et.children_left[node_id], et, criterion,
-                                        depth=depth + 1),
-                        self._make_tree(et.children_right[node_id], et, criterion,
-                                        depth=depth + 1)]
+            children = [self._make_tree(et.children_left[node_id], et,
+                                        criterion, depth=depth + 1),
+                        self._make_tree(et.children_right[node_id], et,
+                                        criterion, depth=depth + 1)]
         else:
             return Tree(name, node_id)
         return Tree(name, node_id, *children)
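[Reviewer note] Patches 02-04 only rewrap the recursive calls so the lines fit flake8's default 79-column limit (the E501 check). A standalone sketch of a quick way to spot offending lines before pushing (the path is an assumption; any file works):

```python
# Flag lines longer than flake8's default 79-column limit (E501), the
# violation the rewrapping in patches 02-04 addresses.
def long_lines(path, limit=79):
    with open(path) as fh:
        for lineno, line in enumerate(fh, start=1):
            width = len(line.rstrip("\n"))
            if width > limit:
                yield lineno, width

for lineno, width in long_lines("sklearn/tree/export.py"):
    print("sklearn/tree/export.py:%d: %d chars" % (lineno, width))
```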
From 8c0fb6b5af71009774e238bd1acffeac08b21c0b Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sat, 25 May 2019 12:31:19 -0500
Subject: [PATCH 05/12] Resolved last line that violated PEP 8

---
 sklearn/tree/export.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py
index f1d70bf2558af..c012632d455a5 100644
--- a/sklearn/tree/export.py
+++ b/sklearn/tree/export.py
@@ -568,7 +568,8 @@ def export(self, decision_tree, ax=None):
             ax = plt.gca()
         ax.clear()
         ax.set_axis_off()
-        my_tree = self._make_tree(0, decision_tree.tree_, decision_tree.criterion)
+        my_tree = self._make_tree(0, decision_tree.tree_,
+                                  decision_tree.criterion)
         draw_tree = buchheim(my_tree)
 
         # important to make sure we're still

From 40f8ddf8adc93199ea4417a2849f6cf69abb86bc Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sat, 25 May 2019 13:44:23 -0500
Subject: [PATCH 06/12] Add test that entropy is calculated and displayed in
 the tree diagram when the decision tree classifier uses the entropy
 criterion

---
 sklearn/tree/tests/test_export.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py
index 7fce48a5cb022..06ca9e446fdcc 100644
--- a/sklearn/tree/tests/test_export.py
+++ b/sklearn/tree/tests/test_export.py
@@ -399,9 +399,28 @@ def test_export_text():
     assert export_text(reg, decimals=1, show_weights=True) == expected_report
 
 
-def test_plot_tree(pyplot):
+def test_plot_tree_entropy(pyplot):
     # mostly smoke tests
-    # Check correctness of export_graphviz
+    # Check correctness of export_graphviz for criterion = entropy
+    clf = DecisionTreeClassifier(max_depth=3,
+                                 min_samples_split=2,
+                                 criterion="entropy",
+                                 random_state=2)
+    clf.fit(X, y)
+
+    # Test export code
+    feature_names = ['first feat', 'sepal_width']
+    nodes = plot_tree(clf, feature_names=feature_names)
+    assert len(nodes) == 3
+    assert nodes[0].get_text() == ("first feat <= 0.0\nentropy = 1.0\n"
+                                   "samples = 6\nvalue = [3, 3]")
+    assert nodes[1].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [3, 0]"
+    assert nodes[2].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [0, 3]"
+
+
+def test_plot_tree_gini(pyplot):
+    # mostly smoke tests
+    # Check correctness of export_graphviz for criterion = gini
     clf = DecisionTreeClassifier(max_depth=3,
                                  min_samples_split=2,
                                  criterion="gini",
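[Reviewer note] The root-node impurities asserted in the two tests follow directly from the class counts: with value = [3, 3] each class has probability 1/2, so the entropy is 1.0 and the Gini impurity is 0.5. A quick check:

```python
from math import log2

# Impurity of the balanced root node (value = [3, 3]) asserted above.
p = [3 / 6, 3 / 6]
entropy = -sum(p_i * log2(p_i) for p_i in p)  # 1.0, as in the entropy test
gini = 1 - sum(p_i ** 2 for p_i in p)         # 0.5, as in the gini test
print(entropy, gini)
```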
From a6823c9f36bea570f9ff5a3e1ead9d73b1c23a7e Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sat, 25 May 2019 22:29:40 -0500
Subject: [PATCH 07/12] Update v0.21.rst

Added #13947 to the change log
---
 doc/whats_new/v0.21.rst | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index bf650629e43fc..f23b21485c315 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -885,6 +885,10 @@ Support for Python 3.4 and below has been officially dropped.
   ``max_depth`` were both specified by the user. Please note that this also
   affects all ensemble methods using decision trees.
   :pr:`12344` by :user:`Adrin Jalali `.
+
+- |Fix| Fixed an issue with :class:`MLPTreeExporter` where it display
+  entropy calculations even for `gini` criterion in DecisionTreeClassifiers.
+  :pr:`13947` by :user:`Frank Hoang `.
 
 :mod:`sklearn.utils`
 ....................
@@ -965,8 +969,8 @@ Baibak, daten-kieker, Denis Kataev, Didi Bar-Zev, Dillon Gardner, Dmitry Mottl,
 Dmitry Vukolov, Dougal J. Sutherland, Dowon, drewmjohnston, Dror Atariah,
 Edward J Brown, Ekaterina Krivich, Elizabeth Sander, Emmanuel Arias, Eric
 Chang, Eric Larson, Erich Schubert, esvhd, Falak, Feda Curic, Federico Caselli,
-Fibinse Xavier`, Finn O'Shea, Gabriel Marzinotto, Gabriel Vacaliuc, Gabriele
-Calvo, Gael Varoquaux, GauravAhlawat, Giuseppe Vettigli, Greg Gandenberger,
+Frank Hoang, Fibinse Xavier`, Finn O'Shea, Gabriel Marzinotto, Gabriel Vacaliuc,
+Gabriele Calvo, Gael Varoquaux, GauravAhlawat, Giuseppe Vettigli, Greg Gandenberger,
 Guillaume Fournier, Guillaume Lemaitre, Gustavo De Mari Pereira, Hanmin Qin,
 haroldfox, hhu-luqi, Hunter McGushion, Ian Sanders, JackLangerman, Jacopo
 Notarstefano, jakirkham, James Bourbeau, Jan Koch, Jan S, janvanrijn, Jarrod

From d66c9d4f7ffeb3c4314d722d4c4a6e32779edcd5 Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sun, 26 May 2019 20:48:51 -0500
Subject: [PATCH 08/12] Update splitting.pyx

Initialize the gain variable with the parent node's negative loss and remove
that term from the _split_gain function in order to improve the performance
of splitting.
---
 sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
index 2f7c7d3453326..4815803de85cd 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
@@ -435,6 +435,7 @@ cdef class Splitter:
         best_split.gain = -1.
         sum_gradient_left, sum_hessian_left = 0., 0.
         n_samples_left = 0
+        gain = -negative_loss(sum_gradients, sum_hessians, l2_regularization)
 
         for bin_idx in range(self.actual_n_bins[feature_idx]):
             n_samples_left += histograms[feature_idx, bin_idx].count
@@ -462,7 +463,7 @@ cdef class Splitter:
                 # won't get any better (hessians are > 0 since loss is convex)
                 break
 
-            gain = _split_gain(sum_gradient_left, sum_hessian_left,
+            gain += _split_gain(sum_gradient_left, sum_hessian_left,
                                sum_gradient_right, sum_hessian_right,
                                sum_gradients, sum_hessians,
                                self.l2_regularization)
@@ -500,11 +501,10 @@ cdef inline Y_DTYPE_C _split_gain(
     """
    cdef:
        Y_DTYPE_C gain
-    gain = negative_loss(sum_gradient_left, sum_hessian_left,
+    gain += negative_loss(sum_gradient_left, sum_hessian_left,
                          l2_regularization)
     gain += negative_loss(sum_gradient_right, sum_hessian_right,
                           l2_regularization)
-    gain -= negative_loss(sum_gradients, sum_hessians, l2_regularization)
     return gain
 
 cdef inline Y_DTYPE_C negative_loss(
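[Reviewer note] Patch 08 hoists the parent term of the split gain out of the per-bin loop, since that term does not depend on the candidate split point. A plain-Python sketch of the intended refactor, assuming the usual XGBoost-style proxy negative_loss(G, H) = G**2 / (H + l2) (the Cython types and histogram layout are omitted):

```python
def negative_loss(sum_gradients, sum_hessians, l2):
    # Proxy for the loss reduction achievable in a node; higher is better.
    return sum_gradients ** 2 / (sum_hessians + l2)

def best_gain(bins, sum_gradients, sum_hessians, l2):
    # The parent term is constant across candidate splits: compute it once.
    parent = negative_loss(sum_gradients, sum_hessians, l2)
    g_left = h_left = 0.0
    best = -1.0
    for g, h in bins:  # per-bin gradient/hessian sums, scanned left to right
        g_left += g
        h_left += h
        g_right = sum_gradients - g_left
        h_right = sum_hessians - h_left
        gain = (negative_loss(g_left, h_left, l2)
                + negative_loss(g_right, h_right, l2) - parent)
        best = max(best, gain)
    return best

print(best_gain([(2.0, 1.0), (-2.0, 1.0)], 0.0, 2.0, 1.0))  # 4.0
```

Note that the patch as written leaves `_split_gain` starting with `gain += ...` on an uninitialized variable, and the caller now accumulates into `gain` across loop iterations instead of recomputing it per bin; patches 10 and 11 revert the change.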
From 7011bb5b00b46d8e39f380b044f95c9ede293189 Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sun, 26 May 2019 20:57:19 -0500
Subject: [PATCH 09/12] Fix global variable accessing

---
 sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
index 4815803de85cd..aa133be4159c5 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
@@ -435,7 +435,7 @@ cdef class Splitter:
         best_split.gain = -1.
         sum_gradient_left, sum_hessian_left = 0., 0.
         n_samples_left = 0
-        gain = -negative_loss(sum_gradients, sum_hessians, l2_regularization)
+        gain = -negative_loss(sum_gradients, sum_hessians, self.l2_regularization)
 
         for bin_idx in range(self.actual_n_bins[feature_idx]):
             n_samples_left += histograms[feature_idx, bin_idx].count

From 8d822846ce1baeeb9a367d93027aee9769fac852 Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sun, 26 May 2019 21:03:31 -0500
Subject: [PATCH 10/12] Revert "Fix global variable accessing"

This reverts commit 7011bb5b00b46d8e39f380b044f95c9ede293189.
---
 sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
index aa133be4159c5..4815803de85cd 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
@@ -435,7 +435,7 @@ cdef class Splitter:
         best_split.gain = -1.
         sum_gradient_left, sum_hessian_left = 0., 0.
         n_samples_left = 0
-        gain = -negative_loss(sum_gradients, sum_hessians, self.l2_regularization)
+        gain = -negative_loss(sum_gradients, sum_hessians, l2_regularization)
 
         for bin_idx in range(self.actual_n_bins[feature_idx]):
             n_samples_left += histograms[feature_idx, bin_idx].count
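[Reviewer note] Patch 09 replaces the bare name `l2_regularization` with the attribute `self.l2_regularization`, and patch 10 reverts it together with the rest of the experiment. The scoping rule behind the attempted fix, in a toy sketch (a hypothetical class, not the real Cython `Splitter`):

```python
# Inside a method, a bare name resolves to a local or module-level binding;
# per-instance state must be read through self, otherwise NameError.
class TinySplitter:
    def __init__(self, l2_regularization):
        self.l2_regularization = l2_regularization

    def parent_term(self, sum_gradients, sum_hessians):
        # Writing just l2_regularization here would raise NameError.
        return sum_gradients ** 2 / (sum_hessians + self.l2_regularization)

print(TinySplitter(1.0).parent_term(4.0, 3.0))  # 4.0
```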
From 61f0e7f8090d4688b04dd0ee7ed60643cf1ec393 Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sun, 26 May 2019 21:04:14 -0500
Subject: [PATCH 11/12] Revert "Update splitting.pyx"

This reverts commit d66c9d4f7ffeb3c4314d722d4c4a6e32779edcd5.
---
 sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
index 4815803de85cd..2f7c7d3453326 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
@@ -435,7 +435,6 @@ cdef class Splitter:
         best_split.gain = -1.
         sum_gradient_left, sum_hessian_left = 0., 0.
         n_samples_left = 0
-        gain = -negative_loss(sum_gradients, sum_hessians, l2_regularization)
 
         for bin_idx in range(self.actual_n_bins[feature_idx]):
             n_samples_left += histograms[feature_idx, bin_idx].count
@@ -463,7 +462,7 @@ cdef class Splitter:
                 # won't get any better (hessians are > 0 since loss is convex)
                 break
 
-            gain += _split_gain(sum_gradient_left, sum_hessian_left,
+            gain = _split_gain(sum_gradient_left, sum_hessian_left,
                                sum_gradient_right, sum_hessian_right,
                                sum_gradients, sum_hessians,
                                self.l2_regularization)
@@ -501,10 +500,11 @@ cdef inline Y_DTYPE_C _split_gain(
     """
    cdef:
        Y_DTYPE_C gain
-    gain += negative_loss(sum_gradient_left, sum_hessian_left,
+    gain = negative_loss(sum_gradient_left, sum_hessian_left,
                          l2_regularization)
     gain += negative_loss(sum_gradient_right, sum_hessian_right,
                           l2_regularization)
+    gain -= negative_loss(sum_gradients, sum_hessians, l2_regularization)
     return gain
 
 cdef inline Y_DTYPE_C negative_loss(

From 2a1aa7f20a4aab36ea13370934e0188627ae95cd Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Mon, 27 May 2019 01:04:02 -0500
Subject: [PATCH 12/12] Move change log update into proper spot and reference
 public API

---
 doc/whats_new/v0.21.rst | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index f23b21485c315..e6e0e6cf620dc 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -44,6 +44,13 @@ Changelog
   `drop` parameter was not reflected in `get_feature_names`. :pr:`13894`
   by :user:`James Myatt `.
 
+:mod:`sklearn.tree`
+................................
+
+- |Fix| Fixed an issue with :func:`plot_tree` where it displayed entropy
+  calculations even for the `gini` criterion in DecisionTreeClassifiers.
+  :pr:`13947` by :user:`Frank Hoang `.
+
 :mod:`sklearn.utils.sparsefuncs`
 ................................
 
@@ -885,10 +892,6 @@ Support for Python 3.4 and below has been officially dropped.
   ``max_depth`` were both specified by the user. Please note that this also
   affects all ensemble methods using decision trees.
   :pr:`12344` by :user:`Adrin Jalali `.
-
-- |Fix| Fixed an issue with :class:`MLPTreeExporter` where it display
-  entropy calculations even for `gini` criterion in DecisionTreeClassifiers.
-  :pr:`13947` by :user:`Frank Hoang `.
 
 :mod:`sklearn.utils`
 ....................
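[Reviewer note] With `test_plot_tree_entropy` and `test_plot_tree_gini` now differing only in the criterion and the expected root impurity, one possible follow-up (a sketch only, not part of this series; it assumes the module-level `X`, `y` data and the `pyplot` fixture of test_export.py) is to fold them into a single parametrized test:

```python
import pytest
from sklearn.tree import DecisionTreeClassifier, plot_tree

@pytest.mark.parametrize("criterion, impurity",
                         [("gini", 0.5), ("entropy", 1.0)])
def test_plot_tree(pyplot, criterion, impurity):
    clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2,
                                 criterion=criterion, random_state=2)
    clf.fit(X, y)

    nodes = plot_tree(clf, feature_names=['first feat', 'sepal_width'])
    assert len(nodes) == 3
    assert nodes[0].get_text() == ("first feat <= 0.0\n%s = %s\n"
                                   "samples = 6\nvalue = [3, 3]"
                                   % (criterion, impurity))
```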