Balanced Random Forest #5181

Closed · wants to merge 20 commits

119 changes: 96 additions & 23 deletions sklearn/ensemble/forest.py
@@ -82,6 +82,58 @@ def _generate_sample_indices(random_state, n_samples):
return sample_indices


def _get_class_balance_data(y):
"""Private function used to fit function.

Args: outcome array y
Returns: tuple of
- classes: list of classes
- class_counts: list of count of each class
- class_indices: list of indices of each class
"""
if y.ndim > 1:
if y.shape[1] == 1:
y = y.flatten()
else:
raise ValueError("Balanced random forest is not implemented "
"for multi-output")

classes = np.unique(y)
class_indices = [np.nonzero(y == cls)[0] for cls in classes]
class_counts = [len(i) for i in class_indices]

return classes, class_counts, class_indices
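
For intuition, here is what the helper above returns for a small label array (an illustrative sketch; it mirrors the unit test added at the bottom of this PR):

import numpy as np
y = np.array([0, 1, 0, 1, 1, 2])
classes, class_counts, class_indices = _get_class_balance_data(y)
# classes       -> array([0, 1, 2])
# class_counts  -> [2, 3, 1]
# class_indices -> [array([0, 2]), array([1, 3, 4]), array([5])]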


def _generate_balanced_sample_indices(random_state, balance_data):
"""Private function used to _parallel_build_trees function.

Generates samples according to the balanced random forest method [1],
adapted for multi-class, i.e. a bootstrap sample from the minority
class and a random sample with replacement of the same size from all
other classes.

References
----------
.. [1] Chen, C., Liaw, A., Breiman, L. (2004). "Using Random Forest
to Learn Imbalanced Data", Tech. Rep. 666, Department of
Statistics, University of California, Berkeley.
"""
classes, class_counts, class_indices = balance_data
min_count = np.min(class_counts)
n_class = len(classes)

random_instance = check_random_state(random_state)
sample_indices = np.empty(n_class*min_count, dtype=int)

for i, (count, indices) in enumerate(zip(class_counts, class_indices)):
random_instances = random_instance.randint(0, count, min_count)
random_indices = indices[random_instances]
sample_indices[i*min_count:(i+1)*min_count] = random_indices

return sample_indices
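
A minimal sketch of the sampler's effect, assuming this branch is installed so the private helpers are importable: every class contributes exactly min_count indices, so each tree sees a class-balanced sample.

import numpy as np
from sklearn.ensemble.forest import (
    _get_class_balance_data, _generate_balanced_sample_indices)

y = np.array([0] * 90 + [1] * 10)  # 9:1 class imbalance
balance_data = _get_class_balance_data(y)
indices = _generate_balanced_sample_indices(0, balance_data)
assert len(indices) == 20  # 2 classes * min_count of 10
assert (y[indices] == 0).sum() == (y[indices] == 1).sum() == 10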


def _generate_unsampled_indices(random_state, n_samples):
"""Private function used to forest._set_oob_score function."""
sample_indices = _generate_sample_indices(random_state, n_samples)
@@ -94,7 +146,7 @@ def _generate_unsampled_indices(random_state, n_samples):


def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
verbose=0, class_weight=None):
verbose=0, class_weight=None, balance_data=None):
"""Private function used to fit a single tree in parallel."""
if verbose > 1:
print("building tree %d of %d" % (tree_idx + 1, n_trees))
@@ -106,7 +158,12 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
else:
curr_sample_weight = sample_weight.copy()

indices = _generate_sample_indices(tree.random_state, n_samples)
if balance_data is None:
indices = _generate_sample_indices(tree.random_state, n_samples)
else:
indices = _generate_balanced_sample_indices(tree.random_state,
balance_data)

sample_counts = bincount(indices, minlength=n_samples)
curr_sample_weight *= sample_counts

@@ -117,6 +174,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
elif class_weight == 'balanced_subsample':
curr_sample_weight *= compute_sample_weight('balanced', y, indices)

tree.sample_weight = curr_sample_weight
tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
else:
tree.fit(X, y, sample_weight=sample_weight, check_input=False)
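
Note that the bootstrap is applied through sample weights rather than row selection: repeated indices raise a row's weight, and rows that were never drawn get weight zero. A small illustration of that step (not part of the diff):

import numpy as np
indices = np.array([0, 0, 2])             # a bootstrap draw over 4 rows
print(np.bincount(indices, minlength=4))  # -> [2 0 1 0]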
@@ -142,7 +200,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None):
class_weight=None,
balanced=False):
super(BaseForest, self).__init__(
base_estimator=base_estimator,
n_estimators=n_estimators,
Expand All @@ -155,6 +214,7 @@ def __init__(self,
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.balanced = balanced

def apply(self, X):
"""Apply trees in the forest to X, return leaf indices.
@@ -207,7 +267,7 @@ def decision_path(self, X):
indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
backend="threading")(
delayed(parallel_helper)(tree, 'decision_path', X,
check_input=False)
check_input=False)
for tree in self.estimators_)

n_nodes = [0]
@@ -222,8 +282,8 @@ def fit(self, X, y, sample_weight=None):
Parameters
----------
X : array-like or sparse matrix of shape = [n_samples, n_features]
The training input samples. Internally, its dtype will be converted to
``dtype=np.float32``. If a sparse matrix is provided, it will be
The training input samples. Internally, its dtype will be converted
to ``dtype=np.float32``. If a sparse matrix is provided, it will be
converted into a sparse ``csc_matrix``.

y : array-like, shape = [n_samples] or [n_samples, n_outputs]
@@ -315,6 +375,9 @@ def fit(self, X, y, sample_weight=None):
random_state=random_state)
trees.append(tree)

balance_data = (_get_class_balance_data(y)
if self.balanced else None)

# Parallel loop: we use the threading backend as the Cython code
# for fitting the trees is internally releasing the Python GIL
# making threading always more efficient than multiprocessing in
@@ -323,7 +386,8 @@
backend="threading")(
delayed(_parallel_build_trees)(
t, self, X, y, sample_weight, i, len(trees),
verbose=self.verbose, class_weight=self.class_weight)
verbose=self.verbose, class_weight=self.class_weight,
balance_data=balance_data)
for i, t in enumerate(trees))

# Collect newly grown trees
@@ -406,7 +470,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None):
class_weight=None,
balanced=False):

super(ForestClassifier, self).__init__(
base_estimator,
@@ -418,7 +483,8 @@ def __init__(self,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
class_weight=class_weight)
class_weight=class_weight,
balanced=balanced)

def _set_oob_score(self, X, y):
"""Compute out-of-bag score"""
@@ -479,7 +545,8 @@ def _validate_y_class_weight(self, y):

y_store_unique_indices = np.zeros(y.shape, dtype=np.int)
for k in range(self.n_outputs_):
classes_k, y_store_unique_indices[:, k] = np.unique(y[:, k], return_inverse=True)
classes_k, y_store_unique_indices[:, k] = np.unique(
y[:, k], return_inverse=True)
self.classes_.append(classes_k)
self.n_classes_.append(classes_k.shape[0])
y = y_store_unique_indices
@@ -489,16 +556,18 @@ def _validate_y_class_weight(self, y):
if isinstance(self.class_weight, six.string_types):
if self.class_weight not in valid_presets:
raise ValueError('Valid presets for class_weight include '
'"balanced" and "balanced_subsample". Given "%s".'
'"balanced" and "balanced_subsample". '
'Given "%s".'
% self.class_weight)
if self.warm_start:
warn('class_weight presets "balanced" or "balanced_subsample" are '
warn('class_weight presets "balanced" or '
'"balanced_subsample" are '
'not recommended for warm_start if the fitted data '
'differs from the full dataset. In order to use '
'"balanced" weights, use compute_class_weight("balanced", '
'classes, y). In place of y you can use a large '
'enough sample of the full training set target to '
'properly estimate the class frequency '
'"balanced" weights, use compute_class_weight('
'"balanced", classes, y). In place of y you can use a'
'large enough sample of the full training set target '
'to properly estimate the class frequency '
'distributions. Pass the resulting weights as the '
'class_weight parameter.')

@@ -554,8 +623,8 @@ def predict_proba(self, X):

The predicted class probabilities of an input sample are computed as
the mean predicted class probabilities of the trees in the forest. The
class probability of a single tree is the fraction of samples of the same
class in a leaf.
class probability of a single tree is the fraction of samples of the
same class in a leaf.

Parameters
----------
@@ -948,7 +1017,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None):
class_weight=None,
balanced=False):
super(RandomForestClassifier, self).__init__(
base_estimator=DecisionTreeClassifier(),
n_estimators=n_estimators,
@@ -963,7 +1033,8 @@ def __init__(self,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
class_weight=class_weight)
class_weight=class_weight,
balanced=balanced)

self.criterion = criterion
self.max_depth = max_depth
@@ -1300,7 +1371,8 @@ class ExtraTreesClassifier(ForestClassifier):
and add more estimators to the ensemble, otherwise, just fit a whole
new forest.

class_weight : dict, list of dicts, "balanced", "balanced_subsample" or None, optional (default=None)
class_weight : dict, list of dicts, "balanced", "balanced_subsample" or
None, optional (default=None)
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. For
multi-output problems, a list of dicts can be provided in the same
@@ -1310,8 +1382,9 @@ class ExtraTreesClassifier(ForestClassifier):
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``

The "balanced_subsample" mode is the same as "balanced" except that weights are
computed based on the bootstrap sample for every tree grown.
The "balanced_subsample" mode is the same as "balanced" except that
weights are computed based on the bootstrap sample for every tree
grown.

For multi-output, the weights of each column of y will be multiplied.

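To see the new flag end to end, a usage sketch (the `balanced` parameter is the one added in this diff; the dataset is synthetic and purely illustrative):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X = rng.randn(1000, 4)
y = (rng.rand(1000) < 0.05).astype(int)  # roughly 5% minority class

# With balanced=True each tree trains on a minority-class bootstrap
# plus equally sized samples, drawn with replacement, from the rest.
clf = RandomForestClassifier(n_estimators=50, balanced=True,
                             random_state=0)
clf.fit(X, y)
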
23 changes: 23 additions & 0 deletions sklearn/tests/test_balanced_random_forest.py
@@ -0,0 +1,23 @@
import numpy as np
from numpy.testing import assert_array_equal

from sklearn.ensemble.forest import (
    _get_class_balance_data, _generate_balanced_sample_indices)


def test_get_class_balance_data():
y = np.array([0, 1, 0, 1, 1, 2])
classes, class_counts, class_indices = _get_class_balance_data(y)
assert_array_equal(classes, [0, 1, 2])
assert_array_equal(class_counts, [2, 3, 1])
assert_array_equal(class_indices[0], [0, 2])
assert_array_equal(class_indices[1], [1, 3, 4])
assert_array_equal(class_indices[2], [5])


def test_generate_balanced_sample_indices():
y = np.array([0, 1, 0, 1, 1, 2])
random_state = 0
balance_data = _get_class_balance_data(y)
sample_indices = _generate_balanced_sample_indices(random_state,
balance_data)
assert_array_equal(sample_indices, [0, 3, 5])
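
The added tests are plain functions built on numpy's assert_array_equal, so they can be run with any standard test collector, for example: pytest sklearn/tests/test_balanced_random_forest.py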