Stratifying Across Classes During Training in ShuffleSplit #5965 #5972

Closed · wants to merge 5 commits
Changes from all commits
sklearn/model_selection/_split.py (106 changes: 88 additions & 18 deletions)
@@ -772,11 +772,12 @@ class BaseShuffleSplit(with_metaclass(ABCMeta)):
     """Base class for ShuffleSplit and StratifiedShuffleSplit"""
 
     def __init__(self, n_iter=10, test_size=0.1, train_size=None,
-                 random_state=None):
-        _validate_shuffle_split_init(test_size, train_size)
+                 stratify_across_classes=False, random_state=None):
+        _validate_shuffle_split_init(test_size, train_size, stratify_across_classes)
         self.n_iter = n_iter
         self.test_size = test_size
         self.train_size = train_size
+        self.stratify_across_classes = stratify_across_classes
         self.random_state = random_state
 
     def split(self, X, y=None, labels=None):
@@ -895,7 +896,7 @@ class ShuffleSplit(BaseShuffleSplit):
 
     def _iter_indices(self, X, y=None, labels=None):
         n_samples = _num_samples(X)
-        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
+        n_train, n_test, train_size, test_size = _validate_shuffle_split(n_samples, self.test_size,
                                                   self.train_size)
         rng = check_random_state(self.random_state)
         for i in range(self.n_iter):
@@ -1005,6 +1006,11 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
         int, represents the absolute number of train samples. If None,
         the value is automatically set to the complement of the test size.
 
+    stratify_across_classes : bool (default is False)
+        If True, every class contributes the same number of training
+        samples: the train fraction (or count) of the smallest class.
+        The motivation is to avoid class imbalance during training.
+
     random_state : int or RandomState
         Pseudo-random number generator state used for random sampling.
 
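For illustration, a minimal usage sketch of the proposed flag (this assumes the PR branch, where `StratifiedShuffleSplit` accepts `stratify_across_classes` and `split` yields train/test index arrays):

```python
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# imbalanced toy labels: 100 samples of class 0, 10 of class 1
y = np.array([0] * 100 + [1] * 10)
X = np.zeros((len(y), 2))  # features do not affect the split itself

sss = StratifiedShuffleSplit(n_iter=3, test_size=0.2, train_size=0.8,
                             stratify_across_classes=True)
for train_idx, test_idx in sss.split(X, y):
    # with the flag, both classes contribute round(10 * 0.8) = 8
    # training samples each, instead of the proportional 80 and 8
    print(np.bincount(y[train_idx]))  # expected: [8 8]
```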
@@ -1028,13 +1034,13 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
     """
 
     def __init__(self, n_iter=10, test_size=0.1, train_size=None,
-                 random_state=None):
+                 random_state=None, stratify_across_classes=False):
         super(StratifiedShuffleSplit, self).__init__(
-            n_iter, test_size, train_size, random_state)
+            n_iter, test_size, train_size, stratify_across_classes, random_state)
 
-    def _iter_indices(self, X, y, labels=None):
+    def _iter_indices(self, X, y=None, labels=None):
         n_samples = _num_samples(X)
-        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
+        n_train, n_test, train_size, test_size = _validate_shuffle_split(n_samples, self.test_size,
                                                   self.train_size)
         classes, y_indices = np.unique(y, return_inverse=True)
         n_classes = classes.shape[0]
@@ -1057,8 +1063,14 @@ def _iter_indices(self, X, y, labels=None):
 
         rng = check_random_state(self.random_state)
         p_i = class_counts / float(n_samples)
-        n_i = np.round(n_train * p_i).astype(int)
-        t_i = np.minimum(class_counts - n_i,
+        if self.stratify_across_classes:
+            n_train_per_class = np.round(class_counts * train_size)
+            smallest_size = np.max([1, np.min(n_train_per_class)])
+            n_i = np.tile(smallest_size, class_counts.shape).astype(int)
+            t_i = (class_counts - n_i).astype(int)
+        else:
+            n_i = np.round(n_train * p_i).astype(int)
+            t_i = np.minimum(class_counts - n_i,
                          np.round(n_test * p_i).astype(int))
 
         for _ in range(self.n_iter):
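To make the arithmetic concrete, here is a standalone re-creation of the two branches (illustrative only; `class_counts`, `train_size`, `n_train` and `n_test` are stand-ins for the values computed inside `_iter_indices`):

```python
import numpy as np

class_counts = np.array([100, 10])  # two imbalanced classes, 110 samples
train_size, n_train, n_test = 0.8, 88, 22  # an 80/20 split of 110 samples

# stratify_across_classes=True: every class gets the train share of the
# smallest class (at least 1 sample)
n_train_per_class = np.round(class_counts * train_size)       # [80., 8.]
smallest_size = np.max([1, np.min(n_train_per_class)])        # 8.0
n_i = np.tile(smallest_size, class_counts.shape).astype(int)  # [8, 8]
t_i = (class_counts - n_i).astype(int)                        # [92, 2]

# stratify_across_classes=False (previous behaviour): proportional shares
p_i = class_counts / float(class_counts.sum())
n_i_prop = np.round(n_train * p_i).astype(int)                # [80, 8]
t_i_prop = np.minimum(class_counts - n_i_prop,
                      np.round(n_test * p_i).astype(int))     # [20, 2]
```

Note that with the flag on, everything not used for training is pushed into the test set (`t_i = class_counts - n_i`), so while the training set is balanced, the test set keeps (in fact amplifies) the original imbalance.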
@@ -1089,15 +1101,20 @@ def _iter_indices(self, X, y, labels=None):
             yield train, test
 
 
-def _validate_shuffle_split_init(test_size, train_size):
-    """Validation helper to check the test_size and train_size at init
+def _validate_shuffle_split_init(test_size, train_size, stratify_across_classes):
+    """Validation helper to check test_size, train_size and stratify_across_classes at init
 
     NOTE This does not take into account the number of samples which is known
     only at split
     """
     if test_size is None and train_size is None:
         raise ValueError('test_size and train_size can not both be None')
 
+    if test_size is not None and train_size is not None \
+            and (np.asarray(train_size).dtype.kind == 'f' and np.asarray(test_size).dtype.kind == 'f') \
+            and (train_size + test_size != 1.):
+        raise ValueError('test_size and train_size must sum to 1. Got %f' % (train_size + test_size))
+
     if test_size is not None:
         if np.asarray(test_size).dtype.kind == 'f':
             if test_size >= 1.:
@@ -1123,6 +1140,10 @@ def _validate_shuffle_split_init(test_size, train_size):
             # int values are checked during split based on the input
             raise ValueError("Invalid value for train_size: %r" % train_size)
 
+    if not isinstance(stratify_across_classes, bool):
+        raise TypeError('stratify_across_classes must be a boolean flag. Got %s'
+                        % stratify_across_classes.__class__)
+
 
 def _validate_shuffle_split(n_samples, test_size, train_size):
     """
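A quick sketch of what the two added init-time checks reject (hypothetical calls against the PR branch):

```python
from sklearn.model_selection import StratifiedShuffleSplit

# float sizes that do not sum to 1.0 now fail fast at construction
try:
    StratifiedShuffleSplit(test_size=0.3, train_size=0.6)
except ValueError as exc:
    print(exc)  # test_size and train_size must sum to 1. Got 0.900000

# the new flag must be an actual bool
try:
    StratifiedShuffleSplit(stratify_across_classes=1)
except TypeError as exc:
    print(exc)  # stratify_across_classes must be a boolean flag. Got ...
```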
@@ -1139,28 +1160,77 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
         raise ValueError("train_size=%d should be smaller than the number of"
                          " samples %d" % (train_size, n_samples))
 
+    if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'i') or \
+            (np.asarray(test_size).dtype.kind == 'i' and np.asarray(train_size).dtype.kind == 'f'):
+        raise ValueError("Data types of train and test sizes mixed up. Use either float or int for both.")
+
+    # this check is necessary to ensure the expected behaviour
+    if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'f'
+            and test_size + train_size != 1.0):
+        raise ValueError("Sum of train and test size fractions must be 1. Got %f" % (test_size + train_size))
+
+    if (np.asarray(test_size).dtype.kind == 'i' and np.asarray(train_size).dtype.kind == 'i'
+            and test_size + train_size != n_samples):
+        raise ValueError("Sum of train and test sizes must be n_samples. Got %d" % (test_size + train_size))
+
+    # initialise to None to work around an UnboundLocalError below
+    n_test = None
+    n_train = None
 
     if np.asarray(test_size).dtype.kind == 'f':
         n_test = ceil(test_size * n_samples)
+        if train_size is None:
+            train_size = 1.0 - test_size
+            n_train = n_samples - n_test
+        else:
+            # TODO probably incorrect way to handle this situation - needs
+            # discussion with core devs and the community
+            n_train = n_samples - n_test
     elif np.asarray(test_size).dtype.kind == 'i':
         n_test = float(test_size)
-    if train_size is None:
-        n_train = n_samples - n_test
+
+        if train_size is None:
+            train_size = n_samples - n_test
+            n_train = n_samples - n_test
+        else:
+            # TODO assert np.asarray(train_size).dtype.kind == 'i': when
+            # test_size is an int, train_size must also be an int and the
+            # two must sum to n_samples
+            n_train = n_samples - n_test
     elif np.asarray(train_size).dtype.kind == 'f':
         n_train = floor(train_size * n_samples)
-    else:
+        if test_size is None:
+            test_size = 1.0 - train_size
+            n_test = n_samples - n_train
+        else:
+            # TODO assert np.asarray(test_size).dtype.kind == 'f': when
+            # train_size is a float, test_size must also be a float and
+            # the two must sum to 1.0
+            n_test = n_samples - n_train
+    elif np.asarray(train_size).dtype.kind == 'i':
         n_train = float(train_size)
-
-    if test_size is None:
-        n_test = n_samples - n_train
+
+        if test_size is None:
+            n_test = n_samples - n_train
+            test_size = int(n_test)
+    else:
+        raise TypeError('Unexpected specification of train_size and test_size. '
+                        'Specify each either as a fraction p such that '
+                        '0.0 < p < 1.0, or as an integer n such that '
+                        '0 < n < n_samples.')
+
+    if n_test is None or n_train is None:
+        raise ValueError('Combination of the given train_size and test_size led to invalid calculations.')
 
     if n_train + n_test > n_samples:
         raise ValueError('The sum of train_size and test_size = %d, '
                          'should be smaller than the number of '
                          'samples %d. Reduce test_size and/or '
                          'train_size.' % (n_train + n_test, n_samples))
 
-    return int(n_train), int(n_test)
+    # TODO find a way to handle the sticky situation in which float and int
+    # are mixed to specify train_size and test_size
+    # if n_train < 1 or n_test < 1:
+    #     raise ValueError('There must be at least one data point for training '
+    #                      'and one for testing. Got {0:d} training and {1:d} '
+    #                      'testing points.'.format(n_train, n_test))
+
+    return int(n_train), int(n_test), float(train_size), float(test_size)
 
 
 class PredefinedSplit(BaseCrossValidator):
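And a sketch of the revised helper's contract (hypothetical calls; `_validate_shuffle_split` is a private helper, importable as below only on the PR branch):

```python
from sklearn.model_selection._split import _validate_shuffle_split

# both float: fractions must sum to 1.0; counts and fractions come back
print(_validate_shuffle_split(100, 0.2, 0.8))  # (80, 20, 0.8, 0.2)

# both int: counts must sum to n_samples
print(_validate_shuffle_split(100, 20, 80))    # (80, 20, 80.0, 20.0)

# mixed int/float specification is now rejected outright
try:
    _validate_shuffle_split(100, 0.2, 80)
except ValueError as exc:
    print(exc)  # Data types of train and test sizes mixed up...
```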
sklearn/model_selection/tests/test_split.py (5 changes: 4 additions & 1 deletion)
@@ -559,10 +559,13 @@ def assert_counts_are_ok(idx_counts, p):
                 counter[id] += 1
         assert_equal(n_splits, n_iter)
 
-        n_train, n_test = _validate_shuffle_split(n_samples,
+        n_train, n_test, _, _ = _validate_shuffle_split(n_samples,
                                                   test_size=1./n_folds,
                                                   train_size=1.-(1./n_folds))
 
+        # TODO revise this test for the proposed modification of
+        # StratifiedShuffleSplit: when float, train_size + test_size must
+        # equal 1.0; when int, train_size + test_size must equal n_samples
         assert_equal(len(train), n_train)
         assert_equal(len(test), n_test)
         assert_equal(len(set(train).intersection(test)), 0)
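Given the TODOs above, a possible regression test for the new behaviour might look like this (a hypothetical sketch, not part of the PR):

```python
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

def test_stratify_across_classes_balances_train_set():
    # hypothetical test: every class contributes the same number of
    # training samples, dictated by the smallest class
    y = np.array([0] * 60 + [1] * 30 + [2] * 10)
    X = np.zeros((len(y), 1))
    sss = StratifiedShuffleSplit(n_iter=5, test_size=0.5, train_size=0.5,
                                 stratify_across_classes=True)
    for train_idx, test_idx in sss.split(X, y):
        # smallest class has 10 samples, so round(10 * 0.5) = 5 per class
        assert np.all(np.bincount(y[train_idx]) == 5)
        # train and test indices must never overlap
        assert len(set(train_idx) & set(test_idx)) == 0
```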