[WIP] Balanced Random Forest #8732

Closed
Wants to merge 17 commits.
97 changes: 85 additions & 12 deletions sklearn/ensemble/forest.py
@@ -82,6 +82,58 @@ def _generate_sample_indices(random_state, n_samples):
    return sample_indices


def _get_class_balance_data(y):
    """Private function used by the fit method."""
    if len(y.shape) == 1:
        classes, class_counts = np.unique(y, return_counts=True)
        class_indices = [np.nonzero(y == cls)[0] for cls in classes]

    else:
        classes, class_counts, class_indices = [], [], []
        for i in range(y.shape[1]):
            y_i = y[:, i]
            classes_i, class_counts_i = np.unique(y_i, return_counts=True)
            # Compare against the i-th output column, not the full 2-D y.
            class_indices_i = [np.nonzero(y_i == cls)[0] for cls in classes_i]
            classes_i = [(i, cls) for cls in classes_i]

            classes.extend(classes_i)
            class_counts.extend(class_counts_i)
            class_indices.extend(class_indices_i)

    return classes, class_counts, class_indices
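
As a quick sanity check, this is what the helper returns for a small imbalanced single-output target (a sketch, assuming numpy is imported as np, as it is in forest.py):

y = np.array([0, 0, 0, 0, 1, 1])
classes, class_counts, class_indices = _get_class_balance_data(y)
# classes       -> array([0, 1])
# class_counts  -> array([4, 2])
# class_indices -> [array([0, 1, 2, 3]), array([4, 5])]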


def _generate_balanced_sample_indices(random_state, balance_data):
    """Private function used by _parallel_build_trees.

    Generates samples according to the balanced random forest method [1],
    adapted for multi-class: a bootstrap sample from the minority class and
    a random sample with replacement of the same size from every other
    class.

    References
    ----------
    .. [1] Chen, C., Liaw, A., Breiman, L. (2004) "Using Random Forest to
           Learn Imbalanced Data", Tech. Rep. 666, 2004
    """
    classes, class_counts, class_indices = balance_data
    min_count = np.min(class_counts)
    n_class = len(classes)

    random_instance = check_random_state(random_state)
    sample_indices = np.empty(n_class * min_count, dtype=int)

    for i, (count, indices) in enumerate(zip(class_counts, class_indices)):
        random_instances = random_instance.randint(0, count, min_count)
        random_indices = indices[random_instances]
        sample_indices[i * min_count:(i + 1) * min_count] = random_indices

    return sample_indices
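
To see the intended effect, a sketch using the two private helpers above (again assuming numpy as np): on a 90/10 target, every class contributes exactly min_count indices, so each tree trains on a class-balanced bootstrap:

y = np.array([0] * 90 + [1] * 10)
indices = _generate_balanced_sample_indices(0, _get_class_balance_data(y))
print(len(indices))             # 20 == 2 classes * 10 minority samples
print(np.bincount(y[indices]))  # [10 10], i.e. balanced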


def _generate_unsampled_indices(random_state, n_samples):
    """Private function used by forest._set_oob_score."""
    sample_indices = _generate_sample_indices(random_state, n_samples)
@@ -94,7 +146,7 @@ def _generate_unsampled_indices(random_state, n_samples):


def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
                          verbose=0, class_weight=None, balance_data=None):
    """Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))
@@ -106,7 +158,12 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
        else:
            curr_sample_weight = sample_weight.copy()

        if balance_data is None:
            indices = _generate_sample_indices(tree.random_state, n_samples)
        else:
            indices = _generate_balanced_sample_indices(tree.random_state,
                                                        balance_data)

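        # Turn the drawn (possibly balanced) indices into per-sample weights:
        # each sample's weight is scaled by how often it was drawn, and
        # samples outside the bootstrap keep weight 0 for this tree.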
        sample_counts = bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

@@ -117,6 +174,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

        tree.sample_weight = curr_sample_weight
        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)
@@ -142,7 +200,8 @@ def __init__(self,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 balanced=False):
        super(BaseForest, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
@@ -155,6 +214,7 @@ def __init__(self,
        self.verbose = verbose
        self.warm_start = warm_start
        self.class_weight = class_weight
        self.balanced = balanced

    def apply(self, X):
        """Apply trees in the forest to X, return leaf indices.
@@ -207,7 +267,7 @@ def decision_path(self, X):
        indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              backend="threading")(
            delayed(parallel_helper)(tree, 'decision_path', X,
                                     check_input=False)
            for tree in self.estimators_)

        n_nodes = [0]
@@ -315,6 +375,11 @@ def fit(self, X, y, sample_weight=None):
                                            random_state=random_state)
                trees.append(tree)

            if self.balanced:
                balance_data = _get_class_balance_data(y)
            else:
                balance_data = None

            # Parallel loop: we use the threading backend as the Cython code
            # for fitting the trees is internally releasing the Python GIL
            # making threading always more efficient than multiprocessing in
@@ -323,7 +388,8 @@
                             backend="threading")(
                delayed(_parallel_build_trees)(
                    t, self, X, y, sample_weight, i, len(trees),
                    verbose=self.verbose, class_weight=self.class_weight,
                    balance_data=balance_data)
                for i, t in enumerate(trees))

            # Collect newly grown trees
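
With these changes in place, the feature would presumably be used as below (a sketch against this branch only; the balanced keyword does not exist in released scikit-learn):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=1000, weights=[0.95, 0.05],
                           random_state=0)

# Each tree now sees a class-balanced bootstrap rather than a plain one.
clf = RandomForestClassifier(n_estimators=100, balanced=True, random_state=0)
clf.fit(X, y)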
@@ -406,7 +472,8 @@ def __init__(self,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 balanced=False):

        super(ForestClassifier, self).__init__(
            base_estimator,
@@ -418,7 +485,8 @@ def __init__(self,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
            balanced=balanced)

    def _set_oob_score(self, X, y):
        """Compute out-of-bag score"""
@@ -948,7 +1016,8 @@ def __init__(self,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 balanced=False):
        super(RandomForestClassifier, self).__init__(
            base_estimator=DecisionTreeClassifier(),
            n_estimators=n_estimators,
@@ -963,7 +1032,8 @@
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
            balanced=balanced)

        self.criterion = criterion
        self.max_depth = max_depth
@@ -1300,7 +1370,8 @@ class ExtraTreesClassifier(ForestClassifier):
        and add more estimators to the ensemble, otherwise, just fit a whole
        new forest.

    class_weight : dict, list of dicts, "balanced", "balanced_subsample",
        or None, optional (default=None)
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
@@ -1310,14 +1381,16 @@ class ExtraTreesClassifier(ForestClassifier):
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``

        The "balanced_subsample" mode is the same as "balanced" except that
        weights are computed based on the bootstrap sample for every tree
        grown.

        For multi-output, the weights of each column of y will be multiplied.

        Note that these weights will be multiplied with sample_weight (passed
        through the fit method) if sample_weight is specified.
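
To make the "balanced" formula concrete, a small worked sketch (assuming numpy as np):

y = np.array([0, 0, 0, 1])
weights = len(y) / (2.0 * np.bincount(y))
# array([0.66666667, 2.        ]): the rare class gets the larger weight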

    Attributes
    ----------
    estimators_ : list of DecisionTreeClassifier