[WIP] Balanced Random Forest #8732

Closed
Wants to merge 17 commits.
97 changes: 85 additions & 12 deletions sklearn/ensemble/forest.py
@@ -82,6 +82,58 @@ def _generate_sample_indices(random_state, n_samples):
    return sample_indices


def _get_class_balance_data(y):
    """Private function used by the fit method."""
    if len(y.shape) == 1:
        classes, class_counts = np.unique(y, return_counts=True)
        class_indices = [np.nonzero(y == cls)[0] for cls in classes]

    else:
        classes, class_counts, class_indices = [], [], []
        for i in range(y.shape[1]):
            y_i = y[:, i]
            classes_i, class_counts_i = np.unique(y_i, return_counts=True)
            # Compare against the i-th output column, not the full 2-D y.
            class_indices_i = [np.nonzero(y_i == cls)[0] for cls in classes_i]
            classes_i = [(i, cls) for cls in classes_i]

            classes.extend(classes_i)
            class_counts.extend(class_counts_i)
            class_indices.extend(class_indices_i)

    return classes, class_counts, class_indices
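
As a quick sanity check, this is what the helper returns for a small imbalanced single-output target (a sketch, assuming numpy is imported as np, as it is in forest.py):

y = np.array([0, 0, 0, 0, 1, 1])
classes, class_counts, class_indices = _get_class_balance_data(y)
# classes       -> array([0, 1])
# class_counts  -> array([4, 2])
# class_indices -> [array([0, 1, 2, 3]), array([4, 5])]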


def _generate_balanced_sample_indices(random_state, balance_data):
    """Private function used by _parallel_build_trees.

    Generates samples according to the balanced random forest method [1],
    adapted for multi-class: a bootstrap sample from the minority class and
    a random sample with replacement of the same size from every other
    class.

    References
    ----------
    .. [1] Chen, C., Liaw, A., Breiman, L. (2004) "Using Random Forest to
           Learn Imbalanced Data", Tech. Rep. 666, 2004
    """
    classes, class_counts, class_indices = balance_data
    min_count = np.min(class_counts)
    n_class = len(classes)

    random_instance = check_random_state(random_state)
    sample_indices = np.empty(n_class * min_count, dtype=int)

    for i, (count, indices) in enumerate(zip(class_counts, class_indices)):
        random_instances = random_instance.randint(0, count, min_count)
        random_indices = indices[random_instances]
        sample_indices[i * min_count:(i + 1) * min_count] = random_indices

    return sample_indices
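
To see the intended effect, a sketch using the two private helpers above (again assuming numpy as np): on a 90/10 target, every class contributes exactly min_count indices, so each tree trains on a class-balanced bootstrap:

y = np.array([0] * 90 + [1] * 10)
indices = _generate_balanced_sample_indices(0, _get_class_balance_data(y))
print(len(indices))             # 20 == 2 classes * 10 minority samples
print(np.bincount(y[indices]))  # [10 10], i.e. balanced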


def _generate_unsampled_indices(random_state, n_samples):
    """Private function used by forest._set_oob_score."""
    sample_indices = _generate_sample_indices(random_state, n_samples)
@@ -94,7 +146,7 @@ def _generate_unsampled_indices(random_state, n_samples):


def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
                          verbose=0, class_weight=None, balance_data=None):
    """Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))
@@ -106,7 +158,12 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
        else:
            curr_sample_weight = sample_weight.copy()

        if balance_data is None:
            indices = _generate_sample_indices(tree.random_state, n_samples)
        else:
            indices = _generate_balanced_sample_indices(tree.random_state,
                                                        balance_data)

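        # Turn the drawn (possibly balanced) indices into per-sample weights:
        # each sample's weight is scaled by how often it was drawn, and
        # samples outside the bootstrap keep weight 0 for this tree.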
        sample_counts = bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

@@ -117,6 +174,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

        tree.sample_weight = curr_sample_weight
        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)
@@ -142,7 +200,8 @@ def __init__(self,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 balanced=False):
        super(BaseForest, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
@@ -155,6 +214,7 @@ def __init__(self,
        self.verbose = verbose
        self.warm_start = warm_start
        self.class_weight = class_weight
        self.balanced = balanced

    def apply(self, X):
        """Apply trees in the forest to X, return leaf indices.
@@ -207,7 +267,7 @@ def decision_path(self, X):
        indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              backend="threading")(
            delayed(parallel_helper)(tree, 'decision_path', X,
                                     check_input=False)
            for tree in self.estimators_)

        n_nodes = [0]
@@ -315,6 +375,11 @@ def fit(self, X, y, sample_weight=None):
                                            random_state=random_state)
                trees.append(tree)

            if self.balanced:
                balance_data = _get_class_balance_data(y)
            else:
                balance_data = None

            # Parallel loop: we use the threading backend as the Cython code
            # for fitting the trees is internally releasing the Python GIL
            # making threading always more efficient than multiprocessing in
@@ -323,7 +388,8 @@
                             backend="threading")(
                delayed(_parallel_build_trees)(
                    t, self, X, y, sample_weight, i, len(trees),
                    verbose=self.verbose, class_weight=self.class_weight,
                    balance_data=balance_data)
                for i, t in enumerate(trees))

            # Collect newly grown trees
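
With these changes in place, the feature would presumably be used as below (a sketch against this branch only; the balanced keyword does not exist in released scikit-learn):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=1000, weights=[0.95, 0.05],
                           random_state=0)

# Each tree now sees a class-balanced bootstrap rather than a plain one.
clf = RandomForestClassifier(n_estimators=100, balanced=True, random_state=0)
clf.fit(X, y)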
@@ -406,7 +472,8 @@ def __init__(self,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 balanced=False):

        super(ForestClassifier, self).__init__(
            base_estimator,
@@ -418,7 +485,8 @@ def __init__(self,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
            balanced=balanced)

    def _set_oob_score(self, X, y):
        """Compute out-of-bag score"""
@@ -948,7 +1016,8 @@ def __init__(self,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 balanced=False):
        super(RandomForestClassifier, self).__init__(
            base_estimator=DecisionTreeClassifier(),
            n_estimators=n_estimators,
@@ -963,7 +1032,8 @@
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
            balanced=balanced)

        self.criterion = criterion
        self.max_depth = max_depth
@@ -1300,7 +1370,8 @@ class ExtraTreesClassifier(ForestClassifier):
        and add more estimators to the ensemble, otherwise, just fit a whole
        new forest.

    class_weight : dict, list of dicts, "balanced", "balanced_subsample",
        or None, optional (default=None)
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
@@ -1310,14 +1381,16 @@ class ExtraTreesClassifier(ForestClassifier):
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``

        The "balanced_subsample" mode is the same as "balanced" except that
        weights are computed based on the bootstrap sample for every tree
        grown.

        For multi-output, the weights of each column of y will be multiplied.

        Note that these weights will be multiplied with sample_weight (passed
        through the fit method) if sample_weight is specified.
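
To make the "balanced" formula concrete, a small worked sketch (assuming numpy as np):

y = np.array([0, 0, 0, 1])
weights = len(y) / (2.0 * np.bincount(y))
# array([0.66666667, 2.        ]): the rare class gets the larger weight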

    Attributes
    ----------
    estimators_ : list of DecisionTreeClassifier