Balanced Random Forest #5181

Closed · wants to merge 20 commits

119 changes: 96 additions & 23 deletions sklearn/ensemble/forest.py
@@ -82,6 +82,58 @@ def _generate_sample_indices(random_state, n_samples):
return sample_indices


def _get_class_balance_data(y):
"""Private function used to fit function.

Args: outcome array y
Returns: tuple of
- classes: list of classes
- class_counts: list of count of each class
- class_indices: list of indices of each class
"""
if y.ndim > 1:
if y.shape[1] == 1:
y = y.flatten()
else:
raise ValueError("Balanced random forest is not implemented "
"for multi-output")

classes = np.unique(y)
class_indices = [np.nonzero(y == cls)[0] for cls in classes]
class_counts = [len(i) for i in class_indices]

return classes, class_counts, class_indices
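
For intuition, here is what the helper above returns for a small label array (an illustrative sketch; it mirrors the unit test added at the bottom of this PR):

import numpy as np
y = np.array([0, 1, 0, 1, 1, 2])
classes, class_counts, class_indices = _get_class_balance_data(y)
# classes       -> array([0, 1, 2])
# class_counts  -> [2, 3, 1]
# class_indices -> [array([0, 2]), array([1, 3, 4]), array([5])]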


def _generate_balanced_sample_indices(random_state, balance_data):
"""Private function used to _parallel_build_trees function.

Generates samples according to the balanced random forest method [1],
adapted for multi-class, i.e. a bootstrap sample from the minority
class and a random sample with replacement of the same size from all
other classes.

References
----------
.. [1] Chen, C., Liaw, A., Breiman, L. (2004). "Using Random Forest
to Learn Imbalanced Data", Tech. Rep. 666, Department of
Statistics, University of California, Berkeley.
"""
classes, class_counts, class_indices = balance_data
min_count = np.min(class_counts)
n_class = len(classes)

random_instance = check_random_state(random_state)
sample_indices = np.empty(n_class*min_count, dtype=int)

for i, (count, indices) in enumerate(zip(class_counts, class_indices)):
random_instances = random_instance.randint(0, count, min_count)
random_indices = indices[random_instances]
sample_indices[i*min_count:(i+1)*min_count] = random_indices

return sample_indices
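
A minimal sketch of the sampler's effect, assuming this branch is installed so the private helpers are importable: every class contributes exactly min_count indices, so each tree sees a class-balanced sample.

import numpy as np
from sklearn.ensemble.forest import (
    _get_class_balance_data, _generate_balanced_sample_indices)

y = np.array([0] * 90 + [1] * 10)  # 9:1 class imbalance
balance_data = _get_class_balance_data(y)
indices = _generate_balanced_sample_indices(0, balance_data)
assert len(indices) == 20  # 2 classes * min_count of 10
assert (y[indices] == 0).sum() == (y[indices] == 1).sum() == 10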


def _generate_unsampled_indices(random_state, n_samples):
"""Private function used to forest._set_oob_score function."""
sample_indices = _generate_sample_indices(random_state, n_samples)
@@ -94,7 +146,7 @@ def _generate_unsampled_indices(random_state, n_samples):


def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
verbose=0, class_weight=None):
verbose=0, class_weight=None, balance_data=None):
"""Private function used to fit a single tree in parallel."""
if verbose > 1:
print("building tree %d of %d" % (tree_idx + 1, n_trees))
@@ -106,7 +158,12 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
else:
curr_sample_weight = sample_weight.copy()

indices = _generate_sample_indices(tree.random_state, n_samples)
if balance_data is None:
indices = _generate_sample_indices(tree.random_state, n_samples)
else:
indices = _generate_balanced_sample_indices(tree.random_state,
balance_data)

sample_counts = bincount(indices, minlength=n_samples)
curr_sample_weight *= sample_counts

@@ -117,6 +174,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
elif class_weight == 'balanced_subsample':
curr_sample_weight *= compute_sample_weight('balanced', y, indices)

tree.sample_weight = curr_sample_weight
tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
else:
tree.fit(X, y, sample_weight=sample_weight, check_input=False)
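
Note that the bootstrap is applied through sample weights rather than row selection: repeated indices raise a row's weight, and rows that were never drawn get weight zero. A small illustration of that step (not part of the diff):

import numpy as np
indices = np.array([0, 0, 2])             # a bootstrap draw over 4 rows
print(np.bincount(indices, minlength=4))  # -> [2 0 1 0]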
@@ -142,7 +200,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None):
class_weight=None,
balanced=False):
super(BaseForest, self).__init__(
base_estimator=base_estimator,
n_estimators=n_estimators,
Expand All @@ -155,6 +214,7 @@ def __init__(self,
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.balanced = balanced

def apply(self, X):
"""Apply trees in the forest to X, return leaf indices.
@@ -207,7 +267,7 @@ def decision_path(self, X):
indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
backend="threading")(
delayed(parallel_helper)(tree, 'decision_path', X,
check_input=False)
check_input=False)
for tree in self.estimators_)

n_nodes = [0]
@@ -222,8 +282,8 @@ def fit(self, X, y, sample_weight=None):
Parameters
----------
X : array-like or sparse matrix of shape = [n_samples, n_features]
The training input samples. Internally, its dtype will be converted to
``dtype=np.float32``. If a sparse matrix is provided, it will be
The training input samples. Internally, its dtype will be converted
to ``dtype=np.float32``. If a sparse matrix is provided, it will be
converted into a sparse ``csc_matrix``.

y : array-like, shape = [n_samples] or [n_samples, n_outputs]
@@ -315,6 +375,9 @@ def fit(self, X, y, sample_weight=None):
random_state=random_state)
trees.append(tree)

balance_data = (_get_class_balance_data(y)
if self.balanced else None)

# Parallel loop: we use the threading backend as the Cython code
# for fitting the trees is internally releasing the Python GIL
# making threading always more efficient than multiprocessing in
@@ -323,7 +386,8 @@
backend="threading")(
delayed(_parallel_build_trees)(
t, self, X, y, sample_weight, i, len(trees),
verbose=self.verbose, class_weight=self.class_weight)
verbose=self.verbose, class_weight=self.class_weight,
balance_data=balance_data)
for i, t in enumerate(trees))

# Collect newly grown trees
@@ -406,7 +470,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None):
class_weight=None,
balanced=False):

super(ForestClassifier, self).__init__(
base_estimator,
@@ -418,7 +483,8 @@ def __init__(self,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
class_weight=class_weight)
class_weight=class_weight,
balanced=balanced)

def _set_oob_score(self, X, y):
"""Compute out-of-bag score"""
@@ -479,7 +545,8 @@ def _validate_y_class_weight(self, y):

y_store_unique_indices = np.zeros(y.shape, dtype=np.int)
for k in range(self.n_outputs_):
classes_k, y_store_unique_indices[:, k] = np.unique(y[:, k], return_inverse=True)
classes_k, y_store_unique_indices[:, k] = np.unique(
y[:, k], return_inverse=True)
self.classes_.append(classes_k)
self.n_classes_.append(classes_k.shape[0])
y = y_store_unique_indices
@@ -489,16 +556,18 @@ def _validate_y_class_weight(self, y):
if isinstance(self.class_weight, six.string_types):
if self.class_weight not in valid_presets:
raise ValueError('Valid presets for class_weight include '
'"balanced" and "balanced_subsample". Given "%s".'
'"balanced" and "balanced_subsample". '
'Given "%s".'
% self.class_weight)
if self.warm_start:
warn('class_weight presets "balanced" or "balanced_subsample" are '
warn('class_weight presets "balanced" or '
'"balanced_subsample" are '
'not recommended for warm_start if the fitted data '
'differs from the full dataset. In order to use '
'"balanced" weights, use compute_class_weight("balanced", '
'classes, y). In place of y you can use a large '
'enough sample of the full training set target to '
'properly estimate the class frequency '
'"balanced" weights, use compute_class_weight('
'"balanced", classes, y). In place of y you can use a'
'large enough sample of the full training set target '
'to properly estimate the class frequency '
'distributions. Pass the resulting weights as the '
'class_weight parameter.')

@@ -554,8 +623,8 @@ def predict_proba(self, X):

The predicted class probabilities of an input sample are computed as
the mean predicted class probabilities of the trees in the forest. The
class probability of a single tree is the fraction of samples of the same
class in a leaf.
class probability of a single tree is the fraction of samples of the
same class in a leaf.

Parameters
----------
@@ -948,7 +1017,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None):
class_weight=None,
balanced=False):
super(RandomForestClassifier, self).__init__(
base_estimator=DecisionTreeClassifier(),
n_estimators=n_estimators,
@@ -963,7 +1033,8 @@ def __init__(self,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
class_weight=class_weight)
class_weight=class_weight,
balanced=balanced)

self.criterion = criterion
self.max_depth = max_depth
@@ -1300,7 +1371,8 @@ class ExtraTreesClassifier(ForestClassifier):
and add more estimators to the ensemble, otherwise, just fit a whole
new forest.

class_weight : dict, list of dicts, "balanced", "balanced_subsample" or None, optional (default=None)
class_weight : dict, list of dicts, "balanced", "balanced_subsample" or
None, optional (default=None)
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. For
multi-output problems, a list of dicts can be provided in the same
@@ -1310,8 +1382,9 @@ class ExtraTreesClassifier(ForestClassifier):
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``

The "balanced_subsample" mode is the same as "balanced" except that weights are
computed based on the bootstrap sample for every tree grown.
The "balanced_subsample" mode is the same as "balanced" except that
weights are computed based on the bootstrap sample for every tree
grown.

For multi-output, the weights of each column of y will be multiplied.

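To see the new flag end to end, a usage sketch (the `balanced` parameter is the one added in this diff; the dataset is synthetic and purely illustrative):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X = rng.randn(1000, 4)
y = (rng.rand(1000) < 0.05).astype(int)  # roughly 5% minority class

# With balanced=True each tree trains on a minority-class bootstrap
# plus equally sized samples, drawn with replacement, from the rest.
clf = RandomForestClassifier(n_estimators=50, balanced=True,
                             random_state=0)
clf.fit(X, y)
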
23 changes: 23 additions & 0 deletions sklearn/tests/test_balanced_random_forest.py
@@ -0,0 +1,23 @@
import numpy as np
from numpy.testing import assert_array_equal

from sklearn.ensemble.forest import (
    _get_class_balance_data, _generate_balanced_sample_indices)


def test_get_class_balance_data():
y = np.array([0, 1, 0, 1, 1, 2])
classes, class_counts, class_indices = _get_class_balance_data(y)
assert_array_equal(classes, [0, 1, 2])
assert_array_equal(class_counts, [2, 3, 1])
assert_array_equal(class_indices[0], [0, 2])
assert_array_equal(class_indices[1], [1, 3, 4])
assert_array_equal(class_indices[2], [5])


def test_generate_balanced_sample_indices():
y = np.array([0, 1, 0, 1, 1, 2])
random_state = 0
balance_data = _get_class_balance_data(y)
sample_indices = _generate_balanced_sample_indices(random_state,
balance_data)
assert_array_equal(sample_indices, [0, 3, 5])
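
The added tests are plain functions built on numpy's assert_array_equal, so they can be run with any standard test collector, for example: pytest sklearn/tests/test_balanced_random_forest.py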