From 7f28f0a35cc6c93c321681e9fa8eb42c0354c494 Mon Sep 17 00:00:00 2001 From: Joe Lucas Date: Sat, 6 Jun 2020 15:59:50 -0400 Subject: [PATCH 01/10] WIP Prototyped Solution, Need recommendations --- sklearn/tree/_classes.py | 58 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 0b6191593b548..56f0673a90e89 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -47,6 +47,59 @@ from ._tree import ccp_pruning_path from . import _tree, _splitter, _criterion + + +def _get_n_samples_bootstrap(n_samples, max_samples): + """ + Get the number of samples in a bootstrap sample. + + Parameters + ---------- + n_samples : int + Number of samples in the dataset. + max_samples : int or float + The maximum number of samples to draw from the total available: + - if float, this indicates a fraction of the total and should be + the interval `(0, 1)`; + - if int, this indicates the exact number of samples; + - if None, this indicates the total number of samples. + + Returns + ------- + n_samples_bootstrap : int + The total number of samples to draw for the bootstrap sample. 
+ """ + if max_samples is None: + return n_samples + + if isinstance(max_samples, numbers.Integral): + if not (1 <= max_samples <= n_samples): + msg = "`max_samples` must be in range 1 to {} but got value {}" + raise ValueError(msg.format(n_samples, max_samples)) + return max_samples + + if isinstance(max_samples, numbers.Real): + if not (0 < max_samples < 1): + msg = "`max_samples` must be in range (0, 1) but got value {}" + raise ValueError(msg.format(max_samples)) + return int(round(n_samples * max_samples)) + + msg = "`max_samples` should be int or float, but got type '{}'" + raise TypeError(msg.format(type(max_samples))) + + +def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): + """ + Private function used to _parallel_build_trees function.""" + + random_instance = check_random_state(random_state) + sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap) + + return sample_indices + + + + __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor", "ExtraTreeClassifier", @@ -370,7 +423,10 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.min_impurity_decrease, min_impurity_split) - builder.build(self.tree_, X, y, sample_weight) + proportion = 0.99 + boot_size = _get_n_samples_bootstrap(X.shape[0], proportion) + ind = _generate_sample_indices(self.random_state, X.shape[0], boot_size) + builder.build(self.tree_, X[ind], y[ind], sample_weight) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] From 632715e3a0710b9d491096c786a40954a00dbc89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=C3=ABl=20Beaugnon?= Date: Mon, 8 Jun 2020 12:57:42 +0200 Subject: [PATCH 02/10] add param node_bootstrap and node_max_samples to classes RandomForestClassifier (and upper classes) and DecisionTree (and upper classes) --- sklearn/ensemble/_forest.py | 34 +++++++++++++++++++++++++++++----- sklearn/tree/_classes.py | 31 ++++++++++++++++++++++++------- 2 files changed, 53 insertions(+), 12 
deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b0b6d492d2f85..291596d3e96bf 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -186,19 +186,22 @@ def __init__(self, n_estimators=100, *, estimator_params=tuple(), bootstrap=False, + node_bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, - max_samples=None): + max_samples=None, + node_max_samples=None): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, estimator_params=estimator_params) self.bootstrap = bootstrap + self.node_bootstrap = node_bootstrap self.oob_score = oob_score self.n_jobs = n_jobs self.random_state = random_state @@ -206,6 +209,7 @@ def __init__(self, self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.node_max_samples = node_max_samples def apply(self, X): """ @@ -486,25 +490,29 @@ def __init__(self, n_estimators=100, *, estimator_params=tuple(), bootstrap=False, + node_bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, - max_samples=None): + max_samples=None, + node_max_samples=None): super().__init__( base_estimator, n_estimators=n_estimators, estimator_params=estimator_params, bootstrap=bootstrap, + node_bootstrap=node_bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, - max_samples=max_samples) + max_samples=max_samples, + node_max_samples=node_max_samples) def _set_oob_score(self, X, y): """ @@ -997,6 +1005,9 @@ class RandomForestClassifier(ForestClassifier): Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree. + node_bootstrap : bool, default=False + Whether bootstrap samples are selected at each node. 
+ oob_score : bool, default=False Whether to use out-of-bag samples to estimate the generalization accuracy. @@ -1066,6 +1077,16 @@ class RandomForestClassifier(ForestClassifier): - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0, 1)`. + node_max_samples : int or float, default=None + If node_bootstrap is True, the number of samples to draw at each node + to select the best split criterion. + + - If None (default), then draw `X.shape[0]` samples. # FIXME + - If int, then draw `node_max_samples` samples. + - If float, then draw `node_max_samples * node size` samples. Thus, + `node_max_samples` should be in the interval `(0, 1)`. # FIXME + + .. versionadded:: 0.22 Attributes @@ -1162,6 +1183,7 @@ def __init__(self, min_impurity_decrease=0., min_impurity_split=None, bootstrap=True, + node_bootstrap=False, oob_score=False, n_jobs=None, random_state=None, @@ -1169,7 +1191,8 @@ def __init__(self, warm_start=False, class_weight=None, ccp_alpha=0.0, - max_samples=None): + max_samples=None, + node_max_samples=None): super().__init__( base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, @@ -1177,7 +1200,8 @@ def __init__(self, "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", "min_impurity_decrease", "min_impurity_split", - "random_state", "ccp_alpha"), + "random_state", "ccp_alpha", "node_bootstrap", + "node_max_samples"), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 56f0673a90e89..4c62e57fa1c52 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -150,7 +150,9 @@ def __init__(self, *, min_impurity_decrease, min_impurity_split, class_weight=None, - ccp_alpha=0.0): + ccp_alpha=0.0, + node_bootstrap=False, + node_max_samples=None): self.criterion = criterion self.splitter = splitter self.max_depth = max_depth @@ -164,6 +166,8 @@ def __init__(self, *, 
self.min_impurity_split = min_impurity_split self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.node_bootstrap = node_bootstrap + self.node_max_samples = node_max_samples def get_depth(self): """Return the depth of the decision tree. @@ -423,10 +427,14 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.min_impurity_decrease, min_impurity_split) - proportion = 0.99 - boot_size = _get_n_samples_bootstrap(X.shape[0], proportion) - ind = _generate_sample_indices(self.random_state, X.shape[0], boot_size) - builder.build(self.tree_, X[ind], y[ind], sample_weight) + if self.node_bootstrap: + boot_size = _get_n_samples_bootstrap(X.shape[0], + self.node_max_samples) + ind = _generate_sample_indices(self.random_state, X.shape[0], + boot_size) + builder.build(self.tree_, X[ind], y[ind], sample_weight) + else: + builder.build(self.tree_, X, y, sample_weight) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] @@ -788,6 +796,11 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): ``ccp_alpha`` will be chosen. By default, no pruning is performed. See :ref:`minimal_cost_complexity_pruning` for details. + node_bootstrap : FIXME (do we want to show this param ? or only use it + with ensemble methods ?) + + node_max_samples : FIXME + .. 
versionadded:: 0.22 Attributes @@ -880,7 +893,9 @@ def __init__(self, *, min_impurity_decrease=0., min_impurity_split=None, class_weight=None, - ccp_alpha=0.0): + ccp_alpha=0.0, + node_bootstrap=False, + node_max_samples=None): super().__init__( criterion=criterion, splitter=splitter, @@ -894,7 +909,9 @@ def __init__(self, *, random_state=random_state, min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, - ccp_alpha=ccp_alpha) + ccp_alpha=ccp_alpha, + node_bootstrap=node_bootstrap, + node_max_samples=node_max_samples) def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted="deprecated"): From 4b6136a0817010eb07e3df5705a9a048eb2c6266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=C3=ABl=20Beaugnon?= Date: Mon, 8 Jun 2020 14:24:19 +0200 Subject: [PATCH 03/10] fix pep8 --- sklearn/ensemble/_forest.py | 1 + sklearn/tree/_classes.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 291596d3e96bf..d4eaa4b1f6148 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -888,6 +888,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): return averaged_predictions + class RandomForestClassifier(ForestClassifier): """ A random forest classifier. diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4c62e57fa1c52..47a2dc6e29828 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -48,7 +48,6 @@ from . import _tree, _splitter, _criterion - def _get_n_samples_bootstrap(n_samples, max_samples): """ Get the number of samples in a bootstrap sample. 
@@ -98,8 +97,6 @@ def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): return sample_indices - - __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor", "ExtraTreeClassifier", From d85d0b05ab68b686f3377b3391af5a0f5020d8bc Mon Sep 17 00:00:00 2001 From: Joe Lucas Date: Mon, 8 Jun 2020 17:11:53 -0400 Subject: [PATCH 04/10] Added Unit Test Replicated bootstrap attributes from DecisionTreeClassifier to DecisionTreeRegressor Wrote unit test to ensure that trees trained on the full dataset are more accurate than bootstrapped trees Thinking about ways to test the training time (the real motivation behind providing this functionality) --- sklearn/tree/_classes.py | 12 ++++++++++-- sklearn/tree/tests/test_tree.py | 25 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 47a2dc6e29828..0dd61a96a07df 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1158,6 +1158,10 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): ``ccp_alpha`` will be chosen. By default, no pruning is performed. See :ref:`minimal_cost_complexity_pruning` for details. + node_bootstrap : FIXME + + node_max_samples : FIXME + .. 
versionadded:: 0.22 Attributes @@ -1240,7 +1244,9 @@ def __init__(self, *, max_leaf_nodes=None, min_impurity_decrease=0., min_impurity_split=None, - ccp_alpha=0.0): + ccp_alpha=0.0, + node_bootstrap=False, + node_max_samples=None): super().__init__( criterion=criterion, splitter=splitter, @@ -1253,7 +1259,9 @@ def __init__(self, *, random_state=random_state, min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, - ccp_alpha=ccp_alpha) + ccp_alpha=ccp_alpha, + node_bootstrap=node_bootstrap, + node_max_samples=node_max_samples) def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted="deprecated"): diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 23788fc97fd60..7a624bc2babbe 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1961,3 +1961,28 @@ def test_X_idx_sorted_deprecated(TreeEstimator): with pytest.warns(FutureWarning, match="The parameter 'X_idx_sorted' is deprecated"): tree.fit(X, y, X_idx_sorted=X_idx_sorted) + + +@pytest.mark.parametrize("tree_type,dataset", + [(DecisionTreeRegressor, diabetes), + (DecisionTreeClassifier, iris)]) +def test_node_bootstrap(tree_type, dataset): + rng = np.random.RandomState(1) + + est = tree_type( + random_state=rng, + node_bootstrap=False, + node_max_samples=None) + est_bootstrap = tree_type( + random_state=rng, + node_bootstrap=True, + node_max_samples=0.75) + + est.fit(dataset.data, dataset.target) + est_bootstrap.fit(dataset.data, dataset.target) + score = accuracy_score(est.predict(dataset.data), dataset.target) + score_bootstrap = accuracy_score( + est_bootstrap.predict(dataset.data), dataset.target) + assert score > score_bootstrap, ( + f'''Failed with a full_sample tree score of {score} and a + bootstrapped tree score of {score_bootstrap}.''') From 929f2db872327976c444fa2378825a419968ab2b Mon Sep 17 00:00:00 2001 From: Joe Lucas Date: Mon, 8 Jun 2020 17:39:25 -0400 Subject: [PATCH 05/10] Moved utility 
functions from ensemble/forest to utils _get_n_samples() and _generate_sample_indices() previously resided in ensemble/_forest.py, leading to a circular definition Moved them both to utils/__init__.py so they could be referenced by ensemble/_forest and tree/_class Ensured all unit tests still pass --- sklearn/ensemble/_forest.py | 53 +++--------------------------------- sklearn/tree/_classes.py | 54 ++----------------------------------- sklearn/utils/__init__.py | 52 +++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 102 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d4eaa4b1f6148..17dce355e9f6e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -56,7 +56,9 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, check_array, compute_sample_weight +from ..utils import (check_random_state, check_array, + compute_sample_weight, _generate_sample_indices, + _get_n_samples_bootstrap) from ..exceptions import DataConversionWarning from ._base import BaseEnsemble, _partition_estimators from ..utils.fixes import _joblib_parallel_args @@ -74,55 +76,6 @@ class calls the ``fit`` method of each sub-estimator on random samples MAX_INT = np.iinfo(np.int32).max -def _get_n_samples_bootstrap(n_samples, max_samples): - """ - Get the number of samples in a bootstrap sample. - - Parameters - ---------- - n_samples : int - Number of samples in the dataset. - max_samples : int or float - The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0, 1)`; - - if int, this indicates the exact number of samples; - - if None, this indicates the total number of samples. 
- - Returns - ------- - n_samples_bootstrap : int - The total number of samples to draw for the bootstrap sample. - """ - if max_samples is None: - return n_samples - - if isinstance(max_samples, numbers.Integral): - if not (1 <= max_samples <= n_samples): - msg = "`max_samples` must be in range 1 to {} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) - return max_samples - - if isinstance(max_samples, numbers.Real): - if not (0 < max_samples < 1): - msg = "`max_samples` must be in range (0, 1) but got value {}" - raise ValueError(msg.format(max_samples)) - return round(n_samples * max_samples) - - msg = "`max_samples` should be int or float, but got type '{}'" - raise TypeError(msg.format(type(max_samples))) - - -def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): - """ - Private function used to _parallel_build_trees function.""" - - random_instance = check_random_state(random_state) - sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap) - - return sample_indices - - def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap): """ Private function used to forest._set_oob_score function.""" diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 0dd61a96a07df..add35804cd46a 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -30,8 +30,8 @@ from ..base import is_classifier from ..base import MultiOutputMixin from ..utils import Bunch -from ..utils import check_array -from ..utils import check_random_state +from ..utils import (check_array, check_random_state, + _get_n_samples_bootstrap, _generate_sample_indices) from ..utils.validation import _check_sample_weight from ..utils import compute_sample_weight from ..utils.multiclass import check_classification_targets @@ -47,56 +47,6 @@ from ._tree import ccp_pruning_path from . 
import _tree, _splitter, _criterion - -def _get_n_samples_bootstrap(n_samples, max_samples): - """ - Get the number of samples in a bootstrap sample. - - Parameters - ---------- - n_samples : int - Number of samples in the dataset. - max_samples : int or float - The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0, 1)`; - - if int, this indicates the exact number of samples; - - if None, this indicates the total number of samples. - - Returns - ------- - n_samples_bootstrap : int - The total number of samples to draw for the bootstrap sample. - """ - if max_samples is None: - return n_samples - - if isinstance(max_samples, numbers.Integral): - if not (1 <= max_samples <= n_samples): - msg = "`max_samples` must be in range 1 to {} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) - return max_samples - - if isinstance(max_samples, numbers.Real): - if not (0 < max_samples < 1): - msg = "`max_samples` must be in range (0, 1) but got value {}" - raise ValueError(msg.format(max_samples)) - return int(round(n_samples * max_samples)) - - msg = "`max_samples` should be int or float, but got type '{}'" - raise TypeError(msg.format(type(max_samples))) - - -def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): - """ - Private function used to _parallel_build_trees function.""" - - random_instance = check_random_state(random_state) - sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap) - - return sample_indices - - __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor", "ExtraTreeClassifier", diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 4149765e7c9aa..6a66b6a0112ed 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -1185,3 +1185,55 @@ def is_abstract(c): # itemgetter is used to ensure the sort does not extend to the 2nd item of # the tuple return 
sorted(set(estimators), key=itemgetter(0)) + + +def _get_n_samples_bootstrap(n_samples, max_samples): + """ + Get the number of samples in a bootstrap sample. + + Parameters + ---------- + n_samples : int + Number of samples in the dataset. + max_samples : int or float + The maximum number of samples to draw from the total available: + - if float, this indicates a fraction of the total and should be + the interval `(0, 1)`; + - if int, this indicates the exact number of samples; + - if None, this indicates the total number of samples. + + Returns + ------- + n_samples_bootstrap : int + The total number of samples to draw for the bootstrap sample. + """ + if max_samples is None: + return n_samples + + if isinstance(max_samples, numbers.Integral): + if not (1 <= max_samples <= n_samples): + msg = "`max_samples` must be in range 1 to {} but got value {}" + raise ValueError(msg.format(n_samples, max_samples)) + return max_samples + + if isinstance(max_samples, numbers.Real): + if not (0 < max_samples < 1): + msg = "`max_samples` must be in range (0, 1) but got value {}" + raise ValueError(msg.format(max_samples)) + return int(round(n_samples * max_samples)) + + msg = "`max_samples` should be int or float, but got type '{}'" + raise TypeError(msg.format(type(max_samples))) + + +def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): + """ + Private function used to _parallel_build_trees function.""" + + random_instance = check_random_state(random_state) + sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap) + + return sample_indices + + + From bf764dee0b3cfb45116f011ea8407932421208fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=C3=ABl=20Beaugnon?= Date: Tue, 9 Jun 2020 13:20:38 +0200 Subject: [PATCH 06/10] fix pep8 --- sklearn/utils/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 6a66b6a0112ed..40d15814ce587 100644 --- a/sklearn/utils/__init__.py +++ 
b/sklearn/utils/__init__.py @@ -1234,6 +1234,3 @@ def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap) return sample_indices - - - From fbcf2f1bbdd5dd036c560a9a1f2116653c90fd49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=C3=ABl=20Beaugnon?= Date: Tue, 9 Jun 2020 13:31:22 +0200 Subject: [PATCH 07/10] rm unused import --- sklearn/ensemble/_forest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 17dce355e9f6e..277ab199df044 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -40,7 +40,6 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause -import numbers from warnings import catch_warnings, simplefilter, warn import threading From 04492e183b6982c5648dc4770700645314cf2974 Mon Sep 17 00:00:00 2001 From: Joe Lucas Date: Wed, 10 Jun 2020 19:17:50 -0400 Subject: [PATCH 08/10] Improved Unit Test Unit test now compares feature importance between full and bootstrapped trees to ensure similarity in tree construction. 
--- sklearn/tree/tests/test_tree.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 7a624bc2babbe..497a3cd6f8780 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1966,7 +1966,8 @@ def test_X_idx_sorted_deprecated(TreeEstimator): @pytest.mark.parametrize("tree_type,dataset", [(DecisionTreeRegressor, diabetes), (DecisionTreeClassifier, iris)]) -def test_node_bootstrap(tree_type, dataset): +def test_node_bootstrap_accuracy(tree_type, dataset): + error_threshold = 0.1 rng = np.random.RandomState(1) est = tree_type( @@ -1980,9 +1981,10 @@ def test_node_bootstrap(tree_type, dataset): est.fit(dataset.data, dataset.target) est_bootstrap.fit(dataset.data, dataset.target) - score = accuracy_score(est.predict(dataset.data), dataset.target) - score_bootstrap = accuracy_score( - est_bootstrap.predict(dataset.data), dataset.target) - assert score > score_bootstrap, ( - f'''Failed with a full_sample tree score of {score} and a - bootstrapped tree score of {score_bootstrap}.''') + + error = mean_squared_error( + est.feature_importances_, + est_bootstrap.feature_importances_) + + assert error < error_threshold, (f'''Difference between bootstrap + and full_sample is {error}, exceeding {error_threshold}''') From d84c2a6c42b9433e7ac242d86fb42f7a9a92ed48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=C3=ABl=20Beaugnon?= Date: Sat, 20 Jun 2020 17:15:34 +0200 Subject: [PATCH 09/10] add @skip_if_32bit --- sklearn/tree/tests/test_tree.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 497a3cd6f8780..d8cdcfd223ed0 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1966,6 +1966,7 @@ def test_X_idx_sorted_deprecated(TreeEstimator): @pytest.mark.parametrize("tree_type,dataset", [(DecisionTreeRegressor, diabetes), (DecisionTreeClassifier, iris)]) 
+@skip_if_32bit def test_node_bootstrap_accuracy(tree_type, dataset): error_threshold = 0.1 rng = np.random.RandomState(1) From bd95a752f1979f8f45d31d37957494e98c50ac04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=C3=ABl=20Beaugnon?= Date: Sat, 20 Jun 2020 18:02:54 +0200 Subject: [PATCH 10/10] fix doc for random forest --- sklearn/ensemble/_forest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 277ab199df044..39fabd7ff4f8c 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1034,10 +1034,10 @@ class RandomForestClassifier(ForestClassifier): If node_bootstrap is True, the number of samples to draw at each node to select the best split criterion. - - If None (default), then draw `X.shape[0]` samples. # FIXME + - If None (default), then draw `node size` samples. - If int, then draw `node_max_samples` samples. - If float, then draw `node_max_samples * node size` samples. Thus, - `node_max_samples` should be in the interval `(0, 1)`. # FIXME + `node_max_samples` should be in the interval `(0, 1)`. .. versionadded:: 0.22