From 74d369f4432fab68710891f8466898d681f79c4c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 6 Dec 2019 16:32:38 -0500 Subject: [PATCH 01/16] WIP --- sklearn/inspection/_partial_dependence.py | 18 ++++++++-- .../tests/test_partial_dependence.py | 35 +++++++++++++++++-- sklearn/tree/_classes.py | 28 +++++++++++++++ 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 40a7f073ca818..60fd5362b8fef 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -26,6 +26,7 @@ from ..utils import _get_column_indices from ..utils.validation import check_is_fitted from ..tree._tree import DTYPE +from ..tree import DecisionTreeRegressor from ..exceptions import NotFittedError from ..ensemble._gb import BaseGradientBoosting from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import ( @@ -105,7 +106,14 @@ def _grid_from_X(X, percentiles, grid_resolution): def _partial_dependence_recursion(est, grid, features): - return est._compute_partial_dependence_recursion(grid, features) + averaged_predictions = est._compute_partial_dependence_recursion(grid, + features) + if averaged_predictions.ndim == 1: + # reshape to (1, n_points) for consistency with + # _partial_dependence_brute + averaged_predictions = averaged_predictions.reshape(1, -1) + + return averaged_predictions def _partial_dependence_brute(est, grid, features, X, response_method): @@ -351,19 +359,23 @@ def partial_dependence(estimator, X, features, response_method='auto', if (isinstance(estimator, BaseGradientBoosting) and estimator.init is None): method = 'recursion' - elif isinstance(estimator, BaseHistGradientBoosting): + elif isinstance(estimator, (BaseHistGradientBoosting, + DecisionTreeRegressor)): method = 'recursion' else: method = 'brute' if method == 'recursion': if not isinstance(estimator, - (BaseGradientBoosting, BaseHistGradientBoosting)): + (BaseGradientBoosting, BaseHistGradientBoosting, + DecisionTreeRegressor)): supported_classes_recursion = ( 'GradientBoostingClassifier', 'GradientBoostingRegressor', 'HistGradientBoostingClassifier', 'HistGradientBoostingRegressor', + 'HistGradientBoostingRegressor', + 'DecisionTreeRegressor', ) raise ValueError( "Only the following estimators support the 'recursion' " diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index fd154356b9e0c..c796a5fb6ec3e 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -206,6 +206,36 @@ def test_partial_dependence_helpers(est, method, target_feature): assert np.allclose(pdp, mean_predictions, rtol=rtol) +@pytest.mark.parametrize('target_feature', range(1)) +def test_decision_tree_vs_gradient_boosting(target_feature): + + X, y = make_regression(random_state=0, n_features=5, n_informative=5) + # The 'init' estimator for GBDT (here the average prediction) isn't taken + # into account with the recursion method, for technical reasons. We set + # the mean to 0 to that this 'bug' doesn't have any effect. + y = y - y.mean() + + # gbdt = HistGradientBoostingRegressor(max_iter=1, learning_rate=1, random_state=0) + gbdt = GradientBoostingRegressor(n_estimators=1, learning_rate=1, random_state=0, min_samples_leaf=1, max_leaf_nodes=None) + gbdt.fit(X, y) + + tree = DecisionTreeRegressor(random_state=0, min_samples_leaf=1) + tree.fit(X, y) + + # target feature will be set to .5 and then to 123 + features = np.array([target_feature], dtype=np.int32) + grid = np.array([[.5], + [123]]) + + pdp_gbdt = _partial_dependence_brute(gbdt, grid, features, X, + response_method='auto') + pdp_tree = _partial_dependence_brute(tree, grid, features, X, + response_method='auto') + assert np.allclose(pdp_gbdt, pdp_tree) + print(gbdt.predict(X)) + print(tree.predict(X)) + + @pytest.mark.parametrize('est', ( GradientBoostingClassifier(random_state=0), HistGradientBoostingClassifier(random_state=0), @@ -236,8 +266,9 @@ def test_recursion_decision_function(est, target_feature): LinearRegression(), GradientBoostingRegressor(random_state=0), HistGradientBoostingRegressor(random_state=0, min_samples_leaf=1, - max_leaf_nodes=None, max_iter=1)) -) + max_leaf_nodes=None, max_iter=1), + DecisionTreeRegressor(random_state=0), +)) @pytest.mark.parametrize('power', (1, 2)) def test_partial_dependence_easy_target(est, power): # If the target y only depends on one feature in an obvious way (linear or diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index ea43716e20ae6..d2a644cb53ee8 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1242,6 +1242,34 @@ def n_classes_(self): warnings.warn(msg, FutureWarning) return np.array([1] * self.n_outputs_, dtype=np.intp) + def _compute_partial_dependence_recursion(self, grid, target_features): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray, shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_features : ndarray, shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + + Returns + ------- + averaged_predictions : ndarray, shape \ + (n_trees_per_iteration, n_samples) + The value of the partial dependence function on each grid point. + """ + check_is_fitted(self, + msg="'estimator' parameter must be a fitted estimator") + grid = np.asarray(grid, dtype=DTYPE, order='C') + averaged_predictions = np.zeros(shape=grid.shape[0], + dtype=np.float64, order='C') + + self.tree_.compute_partial_dependence( + grid, target_features, averaged_predictions) + return averaged_predictions + class ExtraTreeClassifier(DecisionTreeClassifier): """An extremely randomized tree classifier. From 17bc2a3f4c43e1d6774efdb3bb8dfedcf13394a4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 9 Dec 2019 11:03:41 -0500 Subject: [PATCH 02/16] test and doc --- sklearn/inspection/_partial_dependence.py | 22 ++++----- .../tests/test_partial_dependence.py | 45 ++++++++++++------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 60fd5362b8fef..419ec9fc3535a 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -233,11 +233,12 @@ def partial_dependence(estimator, X, features, response_method='auto', method : str, optional (default='auto') The method used to calculate the averaged predictions: - - 'recursion' is only supported for gradient boosting estimator (namely - :class:`GradientBoostingClassifier`, - :class:`GradientBoostingRegressor`, - :class:`HistGradientBoostingClassifier`, - :class:`HistGradientBoostingRegressor`) + - 'recursion' is only supported for some tree-based estimators, (namely + :class:`~sklearn.ensemble.GradientBoostingClassifier`, + :class:`~sklearn.ensemble.GradientBoostingRegressor`, + :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, + :class:`~sklearn.tree.DecisionTreeRegressor`) but is more efficient in terms of speed. With this method, ``X`` is only used to build the grid and the partial dependences are computed using the training @@ -513,11 +514,12 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, method : str, optional (default='auto') The method to use to calculate the partial dependence predictions: - - 'recursion' is only supported for gradient boosting estimator (namely - :class:`GradientBoostingClassifier`, - :class:`GradientBoostingRegressor`, - :class:`HistGradientBoostingClassifier`, - :class:`HistGradientBoostingRegressor`) + - 'recursion' is only supported for some tree-based estimators, (namely + :class:`~sklearn.ensemble.GradientBoostingClassifier`, + :class:`~sklearn.ensemble.GradientBoostingRegressor`, + :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, + :class:`~sklearn.tree.DecisionTreeRegressor`) but is more efficient in terms of speed. With this method, ``X`` is optional and is only used to build the grid and the partial dependences are computed using the training diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index c796a5fb6ec3e..f41570a0cf379 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -206,34 +206,45 @@ def test_partial_dependence_helpers(est, method, target_feature): assert np.allclose(pdp, mean_predictions, rtol=rtol) -@pytest.mark.parametrize('target_feature', range(1)) -def test_decision_tree_vs_gradient_boosting(target_feature): +def test_decision_tree_vs_gradient_boosting(): + # Make sure that the recursion method gives the same results on a + # DecisionTreeRegressor and a GradientBoostingRegressor with 1 tree and + # same parameters. The DecisionTreeRegressor doesn't pass the + # test_partial_dependence_helpers() test. + + # Purely random dataset to avoid correlated features + n_samples = 100 + n_features = 5 + X = np.random.RandomState(0).randn(n_samples, n_features) + y = np.random.RandomState(0).randn(n_samples) - X, y = make_regression(random_state=0, n_features=5, n_informative=5) # The 'init' estimator for GBDT (here the average prediction) isn't taken # into account with the recursion method, for technical reasons. We set # the mean to 0 to that this 'bug' doesn't have any effect. y = y - y.mean() - # gbdt = HistGradientBoostingRegressor(max_iter=1, learning_rate=1, random_state=0) - gbdt = GradientBoostingRegressor(n_estimators=1, learning_rate=1, random_state=0, min_samples_leaf=1, max_leaf_nodes=None) + # set max_depth not too high to avoid splits with same gain but different + # features + max_depth = 5 + gbdt = GradientBoostingRegressor(n_estimators=1, learning_rate=1, + criterion='mse', max_depth=max_depth, + random_state=0) gbdt.fit(X, y) - tree = DecisionTreeRegressor(random_state=0, min_samples_leaf=1) + tree = DecisionTreeRegressor(random_state=0, max_depth=max_depth) tree.fit(X, y) - # target feature will be set to .5 and then to 123 - features = np.array([target_feature], dtype=np.int32) - grid = np.array([[.5], - [123]]) + assert np.allclose(gbdt.predict(X), tree.predict(X)) # sanity check + + grid = np.random.RandomState(0).randn(50).reshape(-1, 1) + for f in range(n_features): + features = np.array([f], dtype=np.int32) - pdp_gbdt = _partial_dependence_brute(gbdt, grid, features, X, - response_method='auto') - pdp_tree = _partial_dependence_brute(tree, grid, features, X, - response_method='auto') - assert np.allclose(pdp_gbdt, pdp_tree) - print(gbdt.predict(X)) - print(tree.predict(X)) + pdp_gbdt = _partial_dependence_brute(gbdt, grid, features, X, + response_method='auto') + pdp_tree = _partial_dependence_brute(tree, grid, features, X, + response_method='auto') + assert np.allclose(pdp_gbdt, pdp_tree) @pytest.mark.parametrize('est', ( From 9fe5234e272097107d83d667cb7be0d6ceab9e17 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 9 Dec 2019 11:09:08 -0500 Subject: [PATCH 03/16] added comment --- sklearn/inspection/tests/test_partial_dependence.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index f41570a0cf379..3ed586f1088e5 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -174,6 +174,11 @@ def test_partial_dependence_helpers(est, method, target_feature): # samples. # This also checks that the brute and recursion methods give the same # output. + # Note that even on the trainset, the brute and the recursion methods + # aren't always strictly equivalent (despite what we say in the docs), in + # particular when the slow method generates unrealistic samples that have + # low mass in the joint distribution of the input features, and when some + # of the features are dependent. Hence the high tolerance on the checks. X, y = make_regression(random_state=0, n_features=5, n_informative=5) # The 'init' estimator for GBDT (here the average prediction) isn't taken From d8f5ee36b802d123b49cbd750791605ed32fcf12 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 9 Dec 2019 11:16:01 -0500 Subject: [PATCH 04/16] pep8 --- sklearn/inspection/_partial_dependence.py | 2 +- sklearn/inspection/tests/test_partial_dependence.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 419ec9fc3535a..4f00243f4595c 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -369,7 +369,7 @@ def partial_dependence(estimator, X, features, response_method='auto', if method == 'recursion': if not isinstance(estimator, (BaseGradientBoosting, BaseHistGradientBoosting, - DecisionTreeRegressor)): + DecisionTreeRegressor)): supported_classes_recursion = ( 'GradientBoostingClassifier', 'GradientBoostingRegressor', diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 3ed586f1088e5..012f2348202e7 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -246,9 +246,9 @@ def test_decision_tree_vs_gradient_boosting(): features = np.array([f], dtype=np.int32) pdp_gbdt = _partial_dependence_brute(gbdt, grid, features, X, - response_method='auto') + response_method='auto') pdp_tree = _partial_dependence_brute(tree, grid, features, X, - response_method='auto') + response_method='auto') assert np.allclose(pdp_gbdt, pdp_tree) From e52ae5084dac1d8d7b26ae0ba0608d1a8693670d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 9 Dec 2019 12:35:04 -0500 Subject: [PATCH 05/16] maybe fix 32 bits issue? --- sklearn/inspection/tests/test_partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 012f2348202e7..3bdefc44ab6a7 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -221,7 +221,7 @@ def test_decision_tree_vs_gradient_boosting(): n_samples = 100 n_features = 5 X = np.random.RandomState(0).randn(n_samples, n_features) - y = np.random.RandomState(0).randn(n_samples) + y = np.random.RandomState(0).randn(n_samples).astype(np.float32) # The 'init' estimator for GBDT (here the average prediction) isn't taken # into account with the recursion method, for technical reasons. We set From 597006b3e9a0f279a5fae11423c279825abea754 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 9 Dec 2019 14:33:06 -0500 Subject: [PATCH 06/16] skip test if 32 bits --- sklearn/inspection/tests/test_partial_dependence.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 3bdefc44ab6a7..6c58d43c2ada9 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -36,6 +36,7 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings +from sklearn.utils._testing import skip_if_32bit # toy sample @@ -211,6 +212,7 @@ def test_partial_dependence_helpers(est, method, target_feature): assert np.allclose(pdp, mean_predictions, rtol=rtol) +@skip_if_32bit def test_decision_tree_vs_gradient_boosting(): # Make sure that the recursion method gives the same results on a # DecisionTreeRegressor and a GradientBoostingRegressor with 1 tree and @@ -221,7 +223,7 @@ def test_decision_tree_vs_gradient_boosting(): n_samples = 100 n_features = 5 X = np.random.RandomState(0).randn(n_samples, n_features) - y = np.random.RandomState(0).randn(n_samples).astype(np.float32) + y = np.random.RandomState(0).randn(n_samples) # The 'init' estimator for GBDT (here the average prediction) isn't taken # into account with the recursion method, for technical reasons. We set From 7d2761e6bae70a3cb451006e80820db102a8b78b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 10 Dec 2019 05:59:09 -0500 Subject: [PATCH 07/16] test recursion instead of brute --- sklearn/inspection/tests/test_partial_dependence.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 6c58d43c2ada9..52bc1aa5e8e0b 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -36,7 +36,6 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import skip_if_32bit # toy sample @@ -212,7 +211,6 @@ def test_partial_dependence_helpers(est, method, target_feature): assert np.allclose(pdp, mean_predictions, rtol=rtol) -@skip_if_32bit def test_decision_tree_vs_gradient_boosting(): # Make sure that the recursion method gives the same results on a # DecisionTreeRegressor and a GradientBoostingRegressor with 1 tree and @@ -247,11 +245,10 @@ def test_decision_tree_vs_gradient_boosting(): for f in range(n_features): features = np.array([f], dtype=np.int32) - pdp_gbdt = _partial_dependence_brute(gbdt, grid, features, X, - response_method='auto') - pdp_tree = _partial_dependence_brute(tree, grid, features, X, - response_method='auto') - assert np.allclose(pdp_gbdt, pdp_tree) + pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features) + pdp_tree = _partial_dependence_recursion(tree, grid, features) + + np.testing.assert_allclose(pdp_gbdt, pdp_tree) @pytest.mark.parametrize('est', ( From 11b648953ba3c4acef3fb137462257bc3cec6286 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 10 Dec 2019 06:03:26 -0500 Subject: [PATCH 08/16] use np.testing --- sklearn/inspection/tests/test_partial_dependence.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 52bc1aa5e8e0b..4c0f1293ebc34 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -239,7 +239,8 @@ def test_decision_tree_vs_gradient_boosting(): tree = DecisionTreeRegressor(random_state=0, max_depth=max_depth) tree.fit(X, y) - assert np.allclose(gbdt.predict(X), tree.predict(X)) # sanity check + # sanity check + np.testing.assert_allclose(gbdt.predict(X), tree.predict(X)) grid = np.random.RandomState(0).randn(50).reshape(-1, 1) for f in range(n_features): From 7263640f2bb7994347ee9164179987efdf45188a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 10 Dec 2019 06:21:51 -0500 Subject: [PATCH 09/16] put back skipif32bits --- sklearn/inspection/tests/test_partial_dependence.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 4c0f1293ebc34..1e66b53cb0468 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -36,6 +36,7 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings +from sklearn.utils._testing import skip_if_32bit # toy sample @@ -211,6 +212,7 @@ def test_partial_dependence_helpers(est, method, target_feature): assert np.allclose(pdp, mean_predictions, rtol=rtol) +@skip_if_32bit def test_decision_tree_vs_gradient_boosting(): # Make sure that the recursion method gives the same results on a # DecisionTreeRegressor and a GradientBoostingRegressor with 1 tree and From 3ee45b6c3ac0dc3a3168ddf4d1648828d593213f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 10 Dec 2019 06:47:06 -0500 Subject: [PATCH 10/16] try converting grid to float32 --- sklearn/inspection/tests/test_partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 1e66b53cb0468..06477b1776469 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -244,7 +244,7 @@ def test_decision_tree_vs_gradient_boosting(): # sanity check np.testing.assert_allclose(gbdt.predict(X), tree.predict(X)) - grid = np.random.RandomState(0).randn(50).reshape(-1, 1) + grid = np.random.RandomState(0).randn(50).reshape(-1, 1).astype(np.float32) for f in range(n_features): features = np.array([f], dtype=np.int32) From cf0df03918b0512b6b47ff0654dbba9000eff12c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 10 Dec 2019 07:13:52 -0500 Subject: [PATCH 11/16] Update sklearn/inspection/tests/test_partial_dependence.py --- sklearn/inspection/tests/test_partial_dependence.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 06477b1776469..6a44543414ab2 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -212,7 +212,6 @@ def test_partial_dependence_helpers(est, method, target_feature): assert np.allclose(pdp, mean_predictions, rtol=rtol) -@skip_if_32bit def test_decision_tree_vs_gradient_boosting(): # Make sure that the recursion method gives the same results on a # DecisionTreeRegressor and a GradientBoostingRegressor with 1 tree and From fb8257bb72d60f1eeec7cc59a44225c121dbdeeb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 10 Dec 2019 07:47:28 -0500 Subject: [PATCH 12/16] pep8 --- sklearn/inspection/tests/test_partial_dependence.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 6a44543414ab2..25fad865636f5 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -36,7 +36,6 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import skip_if_32bit # toy sample From db8dc097d6d953ea0efb7ef26163af943546126d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 10 Dec 2019 08:11:34 -0500 Subject: [PATCH 13/16] still nope --- sklearn/inspection/tests/test_partial_dependence.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 25fad865636f5..1e66b53cb0468 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -36,6 +36,7 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings +from sklearn.utils._testing import skip_if_32bit # toy sample @@ -211,6 +212,7 @@ def test_partial_dependence_helpers(est, method, target_feature): assert np.allclose(pdp, mean_predictions, rtol=rtol) +@skip_if_32bit def test_decision_tree_vs_gradient_boosting(): # Make sure that the recursion method gives the same results on a # DecisionTreeRegressor and a GradientBoostingRegressor with 1 tree and @@ -242,7 +244,7 @@ def test_decision_tree_vs_gradient_boosting(): # sanity check np.testing.assert_allclose(gbdt.predict(X), tree.predict(X)) - grid = np.random.RandomState(0).randn(50).reshape(-1, 1).astype(np.float32) + grid = np.random.RandomState(0).randn(50).reshape(-1, 1) for f in range(n_features): features = np.array([f], dtype=np.int32) From 397ce88bb97a488c2a3643aa356a405cceabc966 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 11 Dec 2019 08:41:32 -0500 Subject: [PATCH 14/16] assert tree from DecisionTree and GBDT is exactly the same --- .../tests/test_partial_dependence.py | 43 ++++++++++++++++++- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 1e66b53cb0468..f28f26b796738 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -36,7 +36,9 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import skip_if_32bit +from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_almost_equal +from sklearn.tree._tree import TREE_LEAF, TREE_UNDEFINED # toy sample @@ -212,7 +214,42 @@ def test_partial_dependence_helpers(est, method, target_feature): assert np.allclose(pdp, mean_predictions, rtol=rtol) -@skip_if_32bit +def assert_is_subtree(tree, subtree): + assert tree.node_count >= subtree.node_count + assert tree.max_depth >= subtree.max_depth + + tree_c_left = tree.children_left + tree_c_right = tree.children_right + subtree_c_left = subtree.children_left + subtree_c_right = subtree.children_right + + stack = [(0, 0)] + while stack: + tree_node_idx, subtree_node_idx = stack.pop() + assert_array_almost_equal(tree.value[tree_node_idx], + subtree.value[subtree_node_idx]) + assert_almost_equal(tree.impurity[tree_node_idx], + subtree.impurity[subtree_node_idx]) + assert_almost_equal(tree.n_node_samples[tree_node_idx], + subtree.n_node_samples[subtree_node_idx]) + assert_almost_equal(tree.weighted_n_node_samples[tree_node_idx], + subtree.weighted_n_node_samples[subtree_node_idx]) + + if (subtree_c_left[subtree_node_idx] == + subtree_c_right[subtree_node_idx]): + # is a leaf + assert_almost_equal(TREE_UNDEFINED, + subtree.threshold[subtree_node_idx]) + else: + # not a leaf + assert_almost_equal(tree.threshold[tree_node_idx], + subtree.threshold[subtree_node_idx]) + stack.append((tree_c_left[tree_node_idx], + subtree_c_left[subtree_node_idx])) + stack.append((tree_c_right[tree_node_idx], + subtree_c_right[subtree_node_idx])) + + def test_decision_tree_vs_gradient_boosting(): # Make sure that the recursion method gives the same results on a # DecisionTreeRegressor and a GradientBoostingRegressor with 1 tree and @@ -244,6 +281,8 @@ def test_decision_tree_vs_gradient_boosting(): # sanity check np.testing.assert_allclose(gbdt.predict(X), tree.predict(X)) + assert_is_subtree(tree.tree_, gbdt[0, 0].tree_) + grid = np.random.RandomState(0).randn(50).reshape(-1, 1) for f in range(n_features): features = np.array([f], dtype=np.int32) From be260a0200791b33c03395be219e8bb2964efbcb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 11 Dec 2019 08:52:39 -0500 Subject: [PATCH 15/16] pep --- sklearn/inspection/tests/test_partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index f28f26b796738..71232807fbb63 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -38,7 +38,7 @@ from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal -from sklearn.tree._tree import TREE_LEAF, TREE_UNDEFINED +from sklearn.tree._tree import TREE_UNDEFINED # toy sample From c09565a652c1e683bd4c67ba6bbb472b76267892 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 11 Dec 2019 12:45:42 -0500 Subject: [PATCH 16/16] skip if 32 bits but better --- .../tests/test_partial_dependence.py | 51 ++++--------------- 1 file changed, 9 insertions(+), 42 deletions(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 71232807fbb63..f53a6777055c2 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -36,9 +36,8 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_almost_equal -from sklearn.tree._tree import TREE_UNDEFINED +from sklearn.utils import _IS_32BIT +from sklearn.tree.tests.test_tree import assert_is_subtree # toy sample @@ -214,42 +213,6 @@ def test_partial_dependence_helpers(est, method, target_feature): assert np.allclose(pdp, mean_predictions, rtol=rtol) -def assert_is_subtree(tree, subtree): - assert tree.node_count >= subtree.node_count - assert tree.max_depth >= subtree.max_depth - - tree_c_left = tree.children_left - tree_c_right = tree.children_right - subtree_c_left = subtree.children_left - subtree_c_right = subtree.children_right - - stack = [(0, 0)] - while stack: - tree_node_idx, subtree_node_idx = stack.pop() - assert_array_almost_equal(tree.value[tree_node_idx], - subtree.value[subtree_node_idx]) - assert_almost_equal(tree.impurity[tree_node_idx], - subtree.impurity[subtree_node_idx]) - assert_almost_equal(tree.n_node_samples[tree_node_idx], - subtree.n_node_samples[subtree_node_idx]) - assert_almost_equal(tree.weighted_n_node_samples[tree_node_idx], - subtree.weighted_n_node_samples[subtree_node_idx]) - - if (subtree_c_left[subtree_node_idx] == - subtree_c_right[subtree_node_idx]): - # is a leaf - assert_almost_equal(TREE_UNDEFINED, - subtree.threshold[subtree_node_idx]) - else: - # not a leaf - assert_almost_equal(tree.threshold[tree_node_idx], - subtree.threshold[subtree_node_idx]) - stack.append((tree_c_left[tree_node_idx], - subtree_c_left[subtree_node_idx])) - stack.append((tree_c_right[tree_node_idx], - subtree_c_right[subtree_node_idx])) - - def test_decision_tree_vs_gradient_boosting(): # Make sure that the recursion method gives the same results on a # DecisionTreeRegressor and a GradientBoostingRegressor with 1 tree and @@ -279,9 +242,13 @@ def test_decision_tree_vs_gradient_boosting(): tree.fit(X, y) # sanity check - np.testing.assert_allclose(gbdt.predict(X), tree.predict(X)) - - assert_is_subtree(tree.tree_, gbdt[0, 0].tree_) + try: + assert_is_subtree(tree.tree_, gbdt[0, 0].tree_) + except AssertionError: + # For some reason the trees aren't exactly equal on 32bits, so the PDs + # cannot be equal either. + assert _IS_32BIT + return grid = np.random.RandomState(0).randn(50).reshape(-1, 1) for f in range(n_features):