diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
index 518dfc76ce592..5019d8b016ffa 100644
--- a/sklearn/ensemble/forest.py
+++ b/sklearn/ensemble/forest.py
@@ -82,6 +82,58 @@ def _generate_sample_indices(random_state, n_samples):
     return sample_indices
 
 
+def _get_class_balance_data(y):
+    """Private function used to fit function.
+
+    Args: outcome array y
+    Returns: tuple of
+        - classes: list of classes
+        - class_counts: list of count of each class
+        - class_indices: list of indices of each class
+    """
+    if len(y.shape) > 1:
+        if y.shape[1] == 1:
+            y = y.flatten()
+        else:
+            raise ValueError("Balanced random forest not implemented "
+                             "for multi-output")
+
+    classes = np.unique(y)
+    class_indices = [np.nonzero(y == cls)[0] for cls in classes]
+    class_counts = [len(i) for i in class_indices]
+
+    return classes, class_counts, class_indices
+
+
+def _generate_balanced_sample_indices(random_state, balance_data):
+    """Private function used to _parallel_build_trees function.
+
+    Generates samples according to the balanced random forest method [1],
+    adapted for multi-class, i.e. a bootstrap sample from the minority
+    class and a random sample with replacement of the same size from all
+    other classes.
+
+    References
+    ----------
+    .. [1] Chen, C., Liaw, A., Breiman, L. (2004) "Using Random Forest to
+           Learn Imbalanced Data", Tech. Rep. 666, 2004
+    """
+    classes, class_counts, class_indices = balance_data
+    min_count = np.min(class_counts)
+    n_class = len(classes)
+
+    random_instance = check_random_state(random_state)
+    sample_indices = np.empty(n_class * min_count, dtype=int)
+
+    for i, count, indices in zip(range(n_class), class_counts,
+                                 class_indices):
+        random_instances = random_instance.randint(0, count, min_count)
+        random_indices = indices[random_instances]
+        sample_indices[i * min_count:(i + 1) * min_count] = random_indices
+
+    return sample_indices
+
+
 def _generate_unsampled_indices(random_state, n_samples):
     """Private function used to forest._set_oob_score function."""
     sample_indices = _generate_sample_indices(random_state, n_samples)
@@ -94,7 +146,7 @@ def _generate_unsampled_indices(random_state, n_samples):
 
 
 def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
-                          verbose=0, class_weight=None):
+                          verbose=0, class_weight=None, balance_data=None):
     """Private function used to fit a single tree in parallel."""
     if verbose > 1:
         print("building tree %d of %d" % (tree_idx + 1, n_trees))
@@ -106,7 +158,12 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
         else:
             curr_sample_weight = sample_weight.copy()
 
-        indices = _generate_sample_indices(tree.random_state, n_samples)
+        if balance_data is None:
+            indices = _generate_sample_indices(tree.random_state, n_samples)
+        else:
+            indices = _generate_balanced_sample_indices(tree.random_state,
+                                                        balance_data)
+
         sample_counts = bincount(indices, minlength=n_samples)
         curr_sample_weight *= sample_counts
@@ -117,6 +174,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
         elif class_weight == 'balanced_subsample':
             curr_sample_weight *= compute_sample_weight('balanced', y, indices)
 
+        tree.sample_weight = curr_sample_weight
         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
     else:
         tree.fit(X, y, sample_weight=sample_weight, check_input=False)
@@ -142,7 +200,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 class_weight=None):
+                 class_weight=None,
+                 balanced=False):
         super(BaseForest, self).__init__(
             base_estimator=base_estimator,
             n_estimators=n_estimators,
@@ -155,6 +214,7 @@ def __init__(self,
         self.verbose = verbose
         self.warm_start = warm_start
         self.class_weight = class_weight
+        self.balanced = balanced
 
     def apply(self, X):
         """Apply trees in the forest to X, return leaf indices.
@@ -207,7 +267,7 @@ def decision_path(self, X):
         indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                               backend="threading")(
             delayed(parallel_helper)(tree, 'decision_path', X,
-                                      check_input=False)
+                                     check_input=False)
             for tree in self.estimators_)
 
         n_nodes = [0]
@@ -222,8 +282,8 @@ def fit(self, X, y, sample_weight=None):
         Parameters
         ----------
         X : array-like or sparse matrix of shape = [n_samples, n_features]
-            The training input samples. Internally, its dtype will be converted to
-            ``dtype=np.float32``. If a sparse matrix is provided, it will be
+            The training input samples. Internally, its dtype will be converted
+            to ``dtype=np.float32``. If a sparse matrix is provided, it will be
             converted into a sparse ``csc_matrix``.
 
         y : array-like, shape = [n_samples] or [n_samples, n_outputs]
@@ -315,6 +375,9 @@ def fit(self, X, y, sample_weight=None):
                     random_state=random_state)
                 trees.append(tree)
 
+            balance_data = (_get_class_balance_data(y)
+                            if self.balanced else None)
+
             # Parallel loop: we use the threading backend as the Cython code
             # for fitting the trees is internally releasing the Python GIL
             # making threading always more efficient than multiprocessing in
@@ -323,7 +386,8 @@ def fit(self, X, y, sample_weight=None):
                              backend="threading")(
                 delayed(_parallel_build_trees)(
                     t, self, X, y, sample_weight, i, len(trees),
-                    verbose=self.verbose, class_weight=self.class_weight)
+                    verbose=self.verbose, class_weight=self.class_weight,
+                    balance_data=balance_data)
                 for i, t in enumerate(trees))
 
             # Collect newly grown trees
@@ -406,7 +470,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 class_weight=None):
+                 class_weight=None,
+                 balanced=False):
 
         super(ForestClassifier, self).__init__(
             base_estimator,
@@ -418,7 +483,8 @@ def __init__(self,
             random_state=random_state,
             verbose=verbose,
             warm_start=warm_start,
-            class_weight=class_weight)
+            class_weight=class_weight,
+            balanced=balanced)
 
     def _set_oob_score(self, X, y):
         """Compute out-of-bag score"""
@@ -479,7 +545,8 @@ def _validate_y_class_weight(self, y):
 
         y_store_unique_indices = np.zeros(y.shape, dtype=np.int)
         for k in range(self.n_outputs_):
-            classes_k, y_store_unique_indices[:, k] = np.unique(y[:, k], return_inverse=True)
+            classes_k, y_store_unique_indices[:, k] = np.unique(
+                y[:, k], return_inverse=True)
             self.classes_.append(classes_k)
             self.n_classes_.append(classes_k.shape[0])
         y = y_store_unique_indices
@@ -489,16 +556,18 @@ def _validate_y_class_weight(self, y):
             if isinstance(self.class_weight, six.string_types):
                 if self.class_weight not in valid_presets:
                     raise ValueError('Valid presets for class_weight include '
-                                     '"balanced" and "balanced_subsample". Given "%s".'
+                                     '"balanced" and "balanced_subsample". '
+                                     'Given "%s".'
                                      % self.class_weight)
                 if self.warm_start:
-                    warn('class_weight presets "balanced" or "balanced_subsample" are '
+                    warn('class_weight presets "balanced" or '
+                         '"balanced_subsample" are '
                          'not recommended for warm_start if the fitted data '
                          'differs from the full dataset. In order to use '
-                         '"balanced" weights, use compute_class_weight("balanced", '
-                         'classes, y). In place of y you can use a large '
-                         'enough sample of the full training set target to '
-                         'properly estimate the class frequency '
+                         '"balanced" weights, use compute_class_weight('
+                         '"balanced", classes, y). In place of y you can use a '
+                         'large enough sample of the full training set target '
+                         'to properly estimate the class frequency '
                          'distributions. Pass the resulting weights as the '
                          'class_weight parameter.')
 
@@ -554,8 +623,8 @@ def predict_proba(self, X):
 
         The predicted class probabilities of an input sample are computed as
         the mean predicted class probabilities of the trees in the forest. The
-        class probability of a single tree is the fraction of samples of the same
-        class in a leaf.
+        class probability of a single tree is the fraction of samples of the
+        same class in a leaf.
 
         Parameters
         ----------
@@ -948,7 +1017,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 class_weight=None):
+                 class_weight=None,
+                 balanced=False):
         super(RandomForestClassifier, self).__init__(
             base_estimator=DecisionTreeClassifier(),
             n_estimators=n_estimators,
@@ -963,7 +1033,8 @@ def __init__(self,
             random_state=random_state,
             verbose=verbose,
             warm_start=warm_start,
-            class_weight=class_weight)
+            class_weight=class_weight,
+            balanced=balanced)
 
         self.criterion = criterion
         self.max_depth = max_depth
@@ -1300,7 +1371,8 @@ class ExtraTreesClassifier(ForestClassifier):
         and add more estimators to the ensemble, otherwise, just fit a whole
         new forest.
 
-    class_weight : dict, list of dicts, "balanced", "balanced_subsample" or None, optional (default=None)
+    class_weight : dict, list of dicts, "balanced", "balanced_subsample" or
+        None, optional (default=None)
         Weights associated with classes in the form ``{class_label: weight}``.
         If not given, all classes are supposed to have weight one. For
         multi-output problems, a list of dicts can be provided in the same
@@ -1310,8 +1382,9 @@ class ExtraTreesClassifier(ForestClassifier):
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``
 
-        The "balanced_subsample" mode is the same as "balanced" except that weights are
-        computed based on the bootstrap sample for every tree grown.
+        The "balanced_subsample" mode is the same as "balanced" except that
+        weights are computed based on the bootstrap sample for every tree
+        grown.
 
         For multi-output, the weights of each column of y will be multiplied.
 
diff --git a/sklearn/tests/test_balanced_random_forest.py b/sklearn/tests/test_balanced_random_forest.py
new file mode 100644
index 0000000000000..8f2a15deaa343
--- /dev/null
+++ b/sklearn/tests/test_balanced_random_forest.py
@@ -0,0 +1,23 @@
+from sklearn.ensemble.forest import (
+    _get_class_balance_data, _generate_balanced_sample_indices)
+import numpy as np
+from numpy.testing import assert_array_equal
+
+
+def test_get_class_balance_data():
+    y = np.array([0, 1, 0, 1, 1, 2])
+    classes, class_counts, class_indices = _get_class_balance_data(y)
+    assert_array_equal(classes, [0, 1, 2])
+    assert_array_equal(class_counts, [2, 3, 1])
+    assert_array_equal(class_indices[0], [0, 2])
+    assert_array_equal(class_indices[1], [1, 3, 4])
+    assert_array_equal(class_indices[2], [5])
+
+
+def test_generate_balanced_sample_indices():
+    y = np.array([0, 1, 0, 1, 1, 2])
+    random_state = 0
+    balance_data = _get_class_balance_data(y)
+    sample_indices = _generate_balanced_sample_indices(random_state,
+                                                       balance_data)
+    assert_array_equal(sample_indices, [0, 3, 5])
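
Note on the sampling scheme: for reviewers who want to sanity-check the balanced bootstrap without applying the patch, below is a minimal standalone sketch in plain NumPy. It mirrors the diff's `_get_class_balance_data` and `_generate_balanced_sample_indices`; `balanced_sample_indices` is a hypothetical name used only here, and the `RandomForestClassifier(balanced=True)` call at the end assumes this patch has been applied.

    import numpy as np

    def balanced_sample_indices(y, random_state=0):
        # Mirrors the patch: draw min_count indices with replacement from
        # every class, so each draw is a class-balanced bootstrap
        # (Chen, Liaw & Breiman, 2004).
        rng = np.random.RandomState(random_state)
        classes = np.unique(y)
        class_indices = [np.nonzero(y == cls)[0] for cls in classes]
        min_count = min(len(idx) for idx in class_indices)
        return np.concatenate([idx[rng.randint(0, len(idx), min_count)]
                               for idx in class_indices])

    y = np.array([0] * 90 + [1] * 10)   # 9:1 class imbalance
    indices = balanced_sample_indices(y)
    print(np.bincount(y[indices]))      # [10 10] -- balanced per draw

With the patch applied, this draw happens once per tree inside `_parallel_build_trees`, seeded by each `tree.random_state`, so e.g. `RandomForestClassifier(n_estimators=100, balanced=True).fit(X, y)` grows every tree on a different balanced bootstrap.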