diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
index 53538866be1fc..a451353e21955 100644
--- a/sklearn/ensemble/forest.py
+++ b/sklearn/ensemble/forest.py
@@ -45,6 +45,8 @@ class calls the ``fit`` method of each sub-estimator on random samples
 from warnings import warn
 from abc import ABCMeta, abstractmethod
+from math import ceil
+import numbers
 
 import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import hstack as sparse_hstack
@@ -74,17 +76,17 @@ class calls the ``fit`` method of each sub-estimator on random samples
 MAX_INT = np.iinfo(np.int32).max
 
 
-def _generate_sample_indices(random_state, n_samples):
+def _generate_sample_indices(random_state, n_samples, max_samples):
     """Private function used to _parallel_build_trees function."""
     random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_samples)
+    sample_indices = random_instance.randint(0, n_samples, max_samples)
 
     return sample_indices
 
 
-def _generate_unsampled_indices(random_state, n_samples):
+def _generate_unsampled_indices(random_state, n_samples, max_samples):
     """Private function used to forest._set_oob_score function."""
-    sample_indices = _generate_sample_indices(random_state, n_samples)
+    sample_indices = _generate_sample_indices(random_state, n_samples, max_samples)
     sample_counts = np.bincount(sample_indices, minlength=n_samples)
     unsampled_mask = sample_counts == 0
     indices_range = np.arange(n_samples)
@@ -94,19 +96,27 @@ def _generate_unsampled_indices(random_state, n_samples):
 
 
 def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
-                          verbose=0, class_weight=None):
+                          verbose=0, class_weight=None, max_samples=1.0):
     """Private function used to fit a single tree in parallel."""
     if verbose > 1:
         print("building tree %d of %d" % (tree_idx + 1, n_trees))
 
     if forest.bootstrap:
         n_samples = X.shape[0]
+
+        # if max_samples is a float, use it as a fraction of n_samples
+        if not isinstance(max_samples, (numbers.Integral, np.integer)):
+            max_samples = int(ceil(max_samples * n_samples))
+
+        if not (0 < max_samples <= n_samples):
+            raise ValueError("max_samples must be in (0, %d], got %d" % (n_samples, max_samples))
+
         if sample_weight is None:
             curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
         else:
             curr_sample_weight = sample_weight.copy()
 
-        indices = _generate_sample_indices(tree.random_state, n_samples)
+        indices = _generate_sample_indices(tree.random_state, n_samples, max_samples)
         sample_counts = np.bincount(indices, minlength=n_samples)
         curr_sample_weight *= sample_counts
 
@@ -142,7 +152,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 class_weight=None):
+                 class_weight=None,
+                 max_samples=1.0):
         super(BaseForest, self).__init__(
             base_estimator=base_estimator,
             n_estimators=n_estimators,
@@ -155,6 +166,7 @@ def __init__(self,
         self.verbose = verbose
         self.warm_start = warm_start
         self.class_weight = class_weight
+        self.max_samples = max_samples
 
     def apply(self, X):
         """Apply trees in the forest to X, return leaf indices.
@@ -323,7 +335,8 @@ def fit(self, X, y, sample_weight=None):
                              backend="threading")(
                 delayed(_parallel_build_trees)(
                     t, self, X, y, sample_weight, i, len(trees),
-                    verbose=self.verbose, class_weight=self.class_weight)
+                    verbose=self.verbose, class_weight=self.class_weight,
+                    max_samples=self.max_samples)
                 for i, t in enumerate(trees))
 
             # Collect newly grown trees
@@ -406,7 +419,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 class_weight=None):
+                 class_weight=None,
+                 max_samples=1.0):
 
         super(ForestClassifier, self).__init__(
             base_estimator,
@@ -418,7 +432,8 @@ def __init__(self,
             random_state=random_state,
             verbose=verbose,
             warm_start=warm_start,
-            class_weight=class_weight)
+            class_weight=class_weight,
+            max_samples=max_samples)
 
     def _set_oob_score(self, X, y):
         """Compute out-of-bag score"""
@@ -426,6 +441,14 @@ def __init__(self,
 
         n_classes_ = self.n_classes_
         n_samples = y.shape[0]
+        max_samples = self.max_samples
+
+        # if max_samples is a float, use it as a fraction of n_samples
+        if not isinstance(max_samples, (numbers.Integral, np.integer)):
+            max_samples = int(ceil(max_samples * n_samples))
+
+        if not (0 < max_samples <= n_samples):
+            raise ValueError("max_samples must be in (0, %d], got %d" % (n_samples, max_samples))
 
         oob_decision_function = []
         oob_score = 0.0
@@ -436,7 +459,7 @@ def _set_oob_score(self, X, y):
 
         for estimator in self.estimators_:
             unsampled_indices = _generate_unsampled_indices(
-                estimator.random_state, n_samples)
+                estimator.random_state, n_samples, max_samples)
             p_estimator = estimator.predict_proba(X[unsampled_indices, :],
                                                   check_input=False)
 
@@ -643,7 +666,8 @@ def __init__(self,
                  n_jobs=1,
                  random_state=None,
                  verbose=0,
-                 warm_start=False):
+                 warm_start=False,
+                 max_samples=1.0):
         super(ForestRegressor, self).__init__(
             base_estimator,
             n_estimators=n_estimators,
@@ -653,7 +677,8 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            max_samples=max_samples)
 
     def predict(self, X):
         """Predict regression target for X.
@@ -700,13 +725,21 @@ def _set_oob_score(self, X, y):
         X = check_array(X, dtype=DTYPE, accept_sparse='csr')
 
         n_samples = y.shape[0]
+        max_samples = self.max_samples
+
+        # if max_samples is a float, use it as a fraction of n_samples
+        if not isinstance(max_samples, (numbers.Integral, np.integer)):
+            max_samples = int(ceil(max_samples * n_samples))
+
+        if not (0 < max_samples <= n_samples):
+            raise ValueError("max_samples must be in (0, %d], got %d" % (n_samples, max_samples))
 
         predictions = np.zeros((n_samples, self.n_outputs_))
         n_predictions = np.zeros((n_samples, self.n_outputs_))
 
         for estimator in self.estimators_:
             unsampled_indices = _generate_unsampled_indices(
-                estimator.random_state, n_samples)
+                estimator.random_state, n_samples, max_samples)
             p_estimator = estimator.predict(
                 X[unsampled_indices, :], check_input=False)
 
@@ -755,6 +788,11 @@ class RandomForestClassifier(ForestClassifier):
     n_estimators : integer, optional (default=10)
         The number of trees in the forest.
 
+    max_samples : int or float, optional (default=1.0)
+        The number of samples to draw from X to train each base estimator.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples.
+
     criterion : string, optional (default="gini")
         The function to measure the quality of a split. Supported criteria are
         "gini" for the Gini impurity and "entropy" for the information gain.
@@ -984,7 +1022,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 class_weight=None):
+                 class_weight=None,
+                 max_samples=1.0):
         super(RandomForestClassifier, self).__init__(
             base_estimator=DecisionTreeClassifier(),
             n_estimators=n_estimators,
@@ -999,7 +1038,8 @@ def __init__(self,
             random_state=random_state,
             verbose=verbose,
             warm_start=warm_start,
-            class_weight=class_weight)
+            class_weight=class_weight,
+            max_samples=max_samples)
 
         self.criterion = criterion
         self.max_depth = max_depth
@@ -1029,6 +1069,11 @@ class RandomForestRegressor(ForestRegressor):
     n_estimators : integer, optional (default=10)
         The number of trees in the forest.
 
+    max_samples : int or float, optional (default=1.0)
+        The number of samples to draw from X to train each base estimator.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples.
+
     criterion : string, optional (default="mse")
         The function to measure the quality of a split. Supported criteria
         are "mse" for the mean squared error, which is equal to variance
@@ -1223,7 +1268,8 @@ def __init__(self,
                  n_jobs=1,
                  random_state=None,
                  verbose=0,
-                 warm_start=False):
+                 warm_start=False,
+                 max_samples=1.0):
         super(RandomForestRegressor, self).__init__(
             base_estimator=DecisionTreeRegressor(),
             n_estimators=n_estimators,
@@ -1237,7 +1283,8 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            max_samples=max_samples)
 
         self.criterion = criterion
         self.max_depth = max_depth
@@ -1265,6 +1312,11 @@ class ExtraTreesClassifier(ForestClassifier):
     n_estimators : integer, optional (default=10)
         The number of trees in the forest.
 
+    max_samples : int or float, optional (default=1.0)
+        The number of samples to draw from X to train each base estimator.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples.
+
     criterion : string, optional (default="gini")
         The function to measure the quality of a split. Supported criteria are
         "gini" for the Gini impurity and "entropy" for the information gain.
@@ -1466,7 +1518,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 class_weight=None):
+                 class_weight=None,
+                 max_samples=1.0):
         super(ExtraTreesClassifier, self).__init__(
             base_estimator=ExtraTreeClassifier(),
             n_estimators=n_estimators,
@@ -1481,7 +1534,8 @@ def __init__(self,
             random_state=random_state,
             verbose=verbose,
             warm_start=warm_start,
-            class_weight=class_weight)
+            class_weight=class_weight,
+            max_samples=max_samples)
 
         self.criterion = criterion
         self.max_depth = max_depth
@@ -1509,6 +1563,11 @@ class ExtraTreesRegressor(ForestRegressor):
     n_estimators : integer, optional (default=10)
         The number of trees in the forest.
 
+    max_samples : int or float, optional (default=1.0)
+        The number of samples to draw from X to train each base estimator.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples.
+
     criterion : string, optional (default="mse")
         The function to measure the quality of a split. Supported criteria
         are "mse" for the mean squared error, which is equal to variance
@@ -1677,7 +1736,8 @@ def __init__(self,
                  n_jobs=1,
                  random_state=None,
                  verbose=0,
-                 warm_start=False):
+                 warm_start=False,
+                 max_samples=1.0):
         super(ExtraTreesRegressor, self).__init__(
             base_estimator=ExtraTreeRegressor(),
             n_estimators=n_estimators,
@@ -1691,7 +1751,8 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            max_samples=max_samples)
 
         self.criterion = criterion
         self.max_depth = max_depth
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
index 897ca8f077a16..9d986a7b819e9 100644
--- a/sklearn/ensemble/tests/test_forest.py
+++ b/sklearn/ensemble/tests/test_forest.py
@@ -95,16 +95,18 @@
 FOREST_ESTIMATORS.update(FOREST_TRANSFORMERS)
 
 
-def check_classification_toy(name):
+def check_classification_toy(name, max_samples):
     """Check classification on a toy dataset."""
     ForestClassifier = FOREST_CLASSIFIERS[name]
 
-    clf = ForestClassifier(n_estimators=10, random_state=1)
+    clf = ForestClassifier(n_estimators=10, random_state=1,
+                           max_samples=max_samples)
     clf.fit(X, y)
     assert_array_equal(clf.predict(T), true_result)
     assert_equal(10, len(clf))
 
-    clf = ForestClassifier(n_estimators=10, max_features=1, random_state=1)
+    clf = ForestClassifier(n_estimators=10, max_features=1, random_state=1,
+                           max_samples=max_samples)
     clf.fit(X, y)
     assert_array_equal(clf.predict(T), true_result)
     assert_equal(10, len(clf))
@@ -115,8 +117,8 @@ def check_classification_toy(name):
 
 
 def test_classification_toy():
-    for name in FOREST_CLASSIFIERS:
-        yield check_classification_toy, name
+    for name, max_samples in product(FOREST_CLASSIFIERS, (1.0, 0.5, 3)):
+        yield check_classification_toy, name, max_samples
 
 
 def check_iris_criterion(name, criterion):
@@ -183,12 +185,12 @@ def test_regressor_attributes():
         yield check_regressor_attributes, name
 
 
-def check_probability(name):
+def check_probability(name, max_samples):
     # Predict probabilities.
     ForestClassifier = FOREST_CLASSIFIERS[name]
     with np.errstate(divide="ignore"):
         clf = ForestClassifier(n_estimators=10, random_state=1, max_features=1,
-                               max_depth=1)
+                               max_depth=1, max_samples=max_samples)
         clf.fit(iris.data, iris.target)
         assert_array_almost_equal(np.sum(clf.predict_proba(iris.data), axis=1),
                                   np.ones(iris.data.shape[0]))
@@ -197,8 +199,8 @@ def check_probability(name):
 
 
 def test_probability():
-    for name in FOREST_CLASSIFIERS:
-        yield check_probability, name
+    for name, max_samples in product(FOREST_CLASSIFIERS, (1.0, 0.8, 0.5, 0.3, 0.1)):
+        yield check_probability, name, max_samples
 
 
 def check_importances(name, criterion, dtype, tolerance):
@@ -356,14 +358,26 @@ def test_unfitted_feature_importances():
     for name in FOREST_ESTIMATORS:
         yield check_unfitted_feature_importances, name
 
+def check_max_samples_equal_0(name):
+    ForestClassifier = FOREST_CLASSIFIERS[name]
+    clf = ForestClassifier(n_estimators=10, random_state=1, max_features=1,
+                           max_depth=1, max_samples=0)
+
+    assert_raises(ValueError, clf.fit,
+                  iris.data, iris.target)
+
 
-def check_oob_score(name, X, y, n_estimators=20):
+def test_max_samples_equal_0():
+    for name in FOREST_CLASSIFIERS:
+        yield check_max_samples_equal_0, name
+
+def check_oob_score(name, X, y, n_estimators=20, max_samples=1.0):
     # Check that oob prediction is a good estimation of the generalization
     # error.
 
     # Proper behavior
     est = FOREST_ESTIMATORS[name](oob_score=True, random_state=0,
-                                  n_estimators=n_estimators, bootstrap=True)
+                                  n_estimators=n_estimators, bootstrap=True, max_samples=max_samples)
     n_samples = X.shape[0]
     est.fit(X[:n_samples // 2, :], y[:n_samples // 2])
     test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:])
@@ -377,25 +391,25 @@ def check_oob_score(name, X, y, n_estimators=20):
     # Check warning if not enough estimators
     with np.errstate(divide="ignore", invalid="ignore"):
         est = FOREST_ESTIMATORS[name](oob_score=True, random_state=0,
-                                      n_estimators=1, bootstrap=True)
+                                      n_estimators=1, bootstrap=True, max_samples=max_samples)
        assert_warns(UserWarning, est.fit, X, y)
 
 
 def test_oob_score():
-    for name in FOREST_CLASSIFIERS:
-        yield check_oob_score, name, iris.data, iris.target
+    for name, max_samples in product(FOREST_CLASSIFIERS, (1.0, 0.8, 0.5)):
+        yield check_oob_score, name, iris.data, iris.target, 20, max_samples
 
         # csc matrix
-        yield check_oob_score, name, csc_matrix(iris.data), iris.target
+        yield check_oob_score, name, csc_matrix(iris.data), iris.target, 20, max_samples
 
         # non-contiguous targets in classification
-        yield check_oob_score, name, iris.data, iris.target * 2 + 1
+        yield check_oob_score, name, iris.data, iris.target * 2 + 1, 20, max_samples
 
-    for name in FOREST_REGRESSORS:
-        yield check_oob_score, name, boston.data, boston.target, 50
+    for name, max_samples in product(FOREST_REGRESSORS, (1.0, 0.8, 0.5)):
+        yield check_oob_score, name, boston.data, boston.target, 50, max_samples
 
         # csc matrix
-        yield check_oob_score, name, csc_matrix(boston.data), boston.target, 50
+        yield check_oob_score, name, csc_matrix(boston.data), boston.target, 50, max_samples
 
 
 def check_oob_score_raise_error(name):
@@ -438,27 +452,27 @@ def test_gridsearch():
         yield check_gridsearch, name
 
 
-def check_parallel(name, X, y):
+def check_parallel(name, X, y, max_samples=1.0):
     """Check parallel computations in classification"""
     ForestEstimator = FOREST_ESTIMATORS[name]
-    forest = ForestEstimator(n_estimators=10, n_jobs=3, random_state=0)
+    forest = ForestEstimator(n_estimators=10, max_samples=max_samples, n_jobs=3, random_state=0)
 
     forest.fit(X, y)
     assert_equal(len(forest), 10)
 
-    forest.set_params(n_jobs=1)
+    forest.set_params(max_samples=max_samples, n_jobs=1)
     y1 = forest.predict(X)
-    forest.set_params(n_jobs=2)
+    forest.set_params(max_samples=max_samples, n_jobs=2)
     y2 = forest.predict(X)
     assert_array_almost_equal(y1, y2, 3)
 
 
 def test_parallel():
-    for name in FOREST_CLASSIFIERS:
-        yield check_parallel, name, iris.data, iris.target
+    for name, max_samples in product(FOREST_CLASSIFIERS, (1.0, 0.8, 0.5, 0.3, 0.1)):
+        yield check_parallel, name, iris.data, iris.target, max_samples
 
-    for name in FOREST_REGRESSORS:
-        yield check_parallel, name, boston.data, boston.target
+    for name, max_samples in product(FOREST_REGRESSORS, (1.0, 0.8, 0.5, 0.3, 0.1)):
+        yield check_parallel, name, boston.data, boston.target, max_samples
 
 
 def check_pickle(name, X, y):
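
For reference, a minimal usage sketch of the `max_samples` parameter introduced by this patch (assuming the modified forest.py above is installed; the toy data and variable names are illustrative, not part of the patch):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.RandomState(0)
    X = rng.rand(100, 4)
    y = (X[:, 0] > 0.5).astype(int)

    # float: each tree is grown on ceil(0.5 * 100) = 50 bootstrap draws
    clf = RandomForestClassifier(n_estimators=20, bootstrap=True,
                                 max_samples=0.5, random_state=0)
    clf.fit(X, y)

    # int: each tree is grown on exactly 25 bootstrap draws
    clf = RandomForestClassifier(n_estimators=20, bootstrap=True,
                                 max_samples=25, random_state=0)
    clf.fit(X, y)

    # values outside (0, n_samples] raise a ValueError at fit time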
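
The reason `_set_oob_score` must resolve and forward the same `max_samples` value as tree fitting: drawing fewer bootstrap indices leaves more samples out-of-bag, so recomputing the unsampled set with the old `n_samples` draw count would disagree with what each tree actually saw. A small standalone sketch of that effect (the sampling helper is re-implemented here for illustration, not imported from sklearn):

    import numpy as np

    def sample_indices(seed, n_samples, max_samples):
        # mirrors the patched _generate_sample_indices
        rng = np.random.RandomState(seed)
        return rng.randint(0, n_samples, max_samples)

    n_samples = 100
    for max_samples in (100, 50, 10):
        idx = sample_indices(0, n_samples, max_samples)
        n_oob = np.sum(np.bincount(idx, minlength=n_samples) == 0)
        print(max_samples, n_oob)  # fewer draws -> more out-of-bag samples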