Stratifying Across Classes During Training in ShuffleSplit #5965 #5972

Closed · wants to merge 5 commits
Changes from all commits
sklearn/model_selection/_split.py (106 changes: 88 additions & 18 deletions)
@@ -772,11 +772,12 @@ class BaseShuffleSplit(with_metaclass(ABCMeta)):
     """Base class for ShuffleSplit and StratifiedShuffleSplit"""
 
     def __init__(self, n_iter=10, test_size=0.1, train_size=None,
-                 random_state=None):
-        _validate_shuffle_split_init(test_size, train_size)
+                 stratify_across_classes=False, random_state=None):
+        _validate_shuffle_split_init(test_size, train_size, stratify_across_classes)
         self.n_iter = n_iter
         self.test_size = test_size
         self.train_size = train_size
+        self.stratify_across_classes = stratify_across_classes
         self.random_state = random_state
 
     def split(self, X, y=None, labels=None):
@@ -895,7 +896,7 @@ class ShuffleSplit(BaseShuffleSplit):
 
     def _iter_indices(self, X, y=None, labels=None):
         n_samples = _num_samples(X)
-        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
+        n_train, n_test, train_size, test_size = _validate_shuffle_split(n_samples, self.test_size,
                                                   self.train_size)
         rng = check_random_state(self.random_state)
         for i in range(self.n_iter):
@@ -1005,6 +1006,11 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
         int, represents the absolute number of train samples. If None,
         the value is automatically set to the complement of the test size.
 
+    stratify_across_classes : bool (default is False)
+        If True, every class contributes the same number of training
+        samples: the train fraction (or count) of the smallest class.
+        The motivation is to avoid class imbalance during training.
+
     random_state : int or RandomState
         Pseudo-random number generator state used for random sampling.
 
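For illustration, a minimal usage sketch of the proposed flag (this assumes the PR branch, where `StratifiedShuffleSplit` accepts `stratify_across_classes` and `split` yields train/test index arrays):

```python
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# imbalanced toy labels: 100 samples of class 0, 10 of class 1
y = np.array([0] * 100 + [1] * 10)
X = np.zeros((len(y), 2))  # features do not affect the split itself

sss = StratifiedShuffleSplit(n_iter=3, test_size=0.2, train_size=0.8,
                             stratify_across_classes=True)
for train_idx, test_idx in sss.split(X, y):
    # with the flag, both classes contribute round(10 * 0.8) = 8
    # training samples each, instead of the proportional 80 and 8
    print(np.bincount(y[train_idx]))  # expected: [8 8]
```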
@@ -1028,13 +1034,13 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
     """
 
     def __init__(self, n_iter=10, test_size=0.1, train_size=None,
-                 random_state=None):
+                 random_state=None, stratify_across_classes=False):
         super(StratifiedShuffleSplit, self).__init__(
-            n_iter, test_size, train_size, random_state)
+            n_iter, test_size, train_size, stratify_across_classes, random_state)
 
-    def _iter_indices(self, X, y, labels=None):
+    def _iter_indices(self, X, y=None, labels=None):
         n_samples = _num_samples(X)
-        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
+        n_train, n_test, train_size, test_size = _validate_shuffle_split(n_samples, self.test_size,
                                                   self.train_size)
         classes, y_indices = np.unique(y, return_inverse=True)
         n_classes = classes.shape[0]
@@ -1057,8 +1063,14 @@ def _iter_indices(self, X, y, labels=None):
 
         rng = check_random_state(self.random_state)
         p_i = class_counts / float(n_samples)
-        n_i = np.round(n_train * p_i).astype(int)
-        t_i = np.minimum(class_counts - n_i,
+        if self.stratify_across_classes:
+            n_train_per_class = np.round(class_counts * train_size)
+            smallest_size = np.max([1, np.min(n_train_per_class)])
+            n_i = np.tile(smallest_size, class_counts.shape).astype(int)
+            t_i = (class_counts - n_i).astype(int)
+        else:
+            n_i = np.round(n_train * p_i).astype(int)
+            t_i = np.minimum(class_counts - n_i,
                          np.round(n_test * p_i).astype(int))
 
         for _ in range(self.n_iter):
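To make the arithmetic concrete, here is a standalone re-creation of the two branches (illustrative only; `class_counts`, `train_size`, `n_train` and `n_test` are stand-ins for the values computed inside `_iter_indices`):

```python
import numpy as np

class_counts = np.array([100, 10])  # two imbalanced classes, 110 samples
train_size, n_train, n_test = 0.8, 88, 22  # an 80/20 split of 110 samples

# stratify_across_classes=True: every class gets the train share of the
# smallest class (at least 1 sample)
n_train_per_class = np.round(class_counts * train_size)       # [80., 8.]
smallest_size = np.max([1, np.min(n_train_per_class)])        # 8.0
n_i = np.tile(smallest_size, class_counts.shape).astype(int)  # [8, 8]
t_i = (class_counts - n_i).astype(int)                        # [92, 2]

# stratify_across_classes=False (previous behaviour): proportional shares
p_i = class_counts / float(class_counts.sum())
n_i_prop = np.round(n_train * p_i).astype(int)                # [80, 8]
t_i_prop = np.minimum(class_counts - n_i_prop,
                      np.round(n_test * p_i).astype(int))     # [20, 2]
```

Note that with the flag on, everything not used for training is pushed into the test set (`t_i = class_counts - n_i`), so while the training set is balanced, the test set keeps (in fact amplifies) the original imbalance.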
@@ -1089,15 +1101,20 @@ def _iter_indices(self, X, y, labels=None):
             yield train, test
 
 
-def _validate_shuffle_split_init(test_size, train_size):
-    """Validation helper to check the test_size and train_size at init
+def _validate_shuffle_split_init(test_size, train_size, stratify_across_classes):
+    """Validation helper to check test_size, train_size and stratify_across_classes at init
 
     NOTE This does not take into account the number of samples which is known
     only at split
     """
     if test_size is None and train_size is None:
         raise ValueError('test_size and train_size can not both be None')
 
+    if test_size is not None and train_size is not None \
+            and (np.asarray(train_size).dtype.kind == 'f' and np.asarray(test_size).dtype.kind == 'f') \
+            and (train_size + test_size != 1.):
+        raise ValueError('test_size and train_size must sum to 1. Got %f' % (train_size + test_size))
+
     if test_size is not None:
         if np.asarray(test_size).dtype.kind == 'f':
             if test_size >= 1.:
@@ -1123,6 +1140,10 @@ def _validate_shuffle_split_init(test_size, train_size):
             # int values are checked during split based on the input
             raise ValueError("Invalid value for train_size: %r" % train_size)
 
+    if not isinstance(stratify_across_classes, bool):
+        raise TypeError('stratify_across_classes must be a boolean flag. Got %s'
+                        % stratify_across_classes.__class__)
+
 
 def _validate_shuffle_split(n_samples, test_size, train_size):
     """
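A quick sketch of what the two added init-time checks reject (hypothetical calls against the PR branch):

```python
from sklearn.model_selection import StratifiedShuffleSplit

# float sizes that do not sum to 1.0 now fail fast at construction
try:
    StratifiedShuffleSplit(test_size=0.3, train_size=0.6)
except ValueError as exc:
    print(exc)  # test_size and train_size must sum to 1. Got 0.900000

# the new flag must be an actual bool
try:
    StratifiedShuffleSplit(stratify_across_classes=1)
except TypeError as exc:
    print(exc)  # stratify_across_classes must be a boolean flag. Got ...
```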
@@ -1139,28 +1160,77 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
         raise ValueError("train_size=%d should be smaller than the number of"
                          " samples %d" % (train_size, n_samples))
 
+    if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'i') or \
+            (np.asarray(test_size).dtype.kind == 'i' and np.asarray(train_size).dtype.kind == 'f'):
+        raise ValueError("Data types of train and test sizes mixed up. Use either float or int for both.")
+
+    # this check is necessary to ensure the expected behaviour
+    if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'f'
+            and test_size + train_size != 1.0):
+        raise ValueError("Sum of train and test size fractions must be 1. Got %f" % (test_size + train_size))
+
+    if (np.asarray(test_size).dtype.kind == 'i' and np.asarray(train_size).dtype.kind == 'i'
+            and test_size + train_size != n_samples):
+        raise ValueError("Sum of train and test sizes must be n_samples. Got %d" % (test_size + train_size))
+
+    # initialise to None to work around an UnboundLocalError below
+    n_test = None
+    n_train = None
 
     if np.asarray(test_size).dtype.kind == 'f':
         n_test = ceil(test_size * n_samples)
+        if train_size is None:
+            train_size = 1.0 - test_size
+            n_train = n_samples - n_test
+        else:
+            # TODO probably incorrect way to handle this situation - needs
+            # discussion with core devs and the community
+            n_train = n_samples - n_test
     elif np.asarray(test_size).dtype.kind == 'i':
         n_test = float(test_size)
-    if train_size is None:
-        n_train = n_samples - n_test
+
+        if train_size is None:
+            train_size = n_samples - n_test
+            n_train = n_samples - n_test
+        else:
+            # TODO assert np.asarray(train_size).dtype.kind == 'i': when
+            # test_size is an int, train_size must also be an int and the
+            # two must sum to n_samples
+            n_train = n_samples - n_test
     elif np.asarray(train_size).dtype.kind == 'f':
         n_train = floor(train_size * n_samples)
-    else:
+        if test_size is None:
+            test_size = 1.0 - train_size
+            n_test = n_samples - n_train
+        else:
+            # TODO assert np.asarray(test_size).dtype.kind == 'f': when
+            # train_size is a float, test_size must also be a float and
+            # the two must sum to 1.0
+            n_test = n_samples - n_train
+    elif np.asarray(train_size).dtype.kind == 'i':
         n_train = float(train_size)
-
-    if test_size is None:
-        n_test = n_samples - n_train
+
+        if test_size is None:
+            n_test = n_samples - n_train
+            test_size = int(n_test)
+    else:
+        raise TypeError('Unexpected specification of train_size and test_size. '
+                        'Specify each either as a fraction p such that '
+                        '0.0 < p < 1.0, or as an integer n such that '
+                        '0 < n < n_samples.')
+
+    if n_test is None or n_train is None:
+        raise ValueError('Combination of the given train_size and test_size led to invalid calculations.')
 
     if n_train + n_test > n_samples:
         raise ValueError('The sum of train_size and test_size = %d, '
                          'should be smaller than the number of '
                          'samples %d. Reduce test_size and/or '
                          'train_size.' % (n_train + n_test, n_samples))
 
-    return int(n_train), int(n_test)
+    # TODO find a way to handle the sticky situation in which float and int
+    # are mixed to specify train_size and test_size
+    # if n_train < 1 or n_test < 1:
+    #     raise ValueError('There must be at least one data point for training '
+    #                      'and one for testing. Got {0:d} training and {1:d} '
+    #                      'testing points.'.format(n_train, n_test))
+
+    return int(n_train), int(n_test), float(train_size), float(test_size)
 
 
 class PredefinedSplit(BaseCrossValidator):
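And a sketch of the revised helper's contract (hypothetical calls; `_validate_shuffle_split` is a private helper, importable as below only on the PR branch):

```python
from sklearn.model_selection._split import _validate_shuffle_split

# both float: fractions must sum to 1.0; counts and fractions come back
print(_validate_shuffle_split(100, 0.2, 0.8))  # (80, 20, 0.8, 0.2)

# both int: counts must sum to n_samples
print(_validate_shuffle_split(100, 20, 80))    # (80, 20, 80.0, 20.0)

# mixed int/float specification is now rejected outright
try:
    _validate_shuffle_split(100, 0.2, 80)
except ValueError as exc:
    print(exc)  # Data types of train and test sizes mixed up...
```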
sklearn/model_selection/tests/test_split.py (5 changes: 4 additions & 1 deletion)
@@ -559,10 +559,13 @@ def assert_counts_are_ok(idx_counts, p):
                 counter[id] += 1
         assert_equal(n_splits, n_iter)
 
-        n_train, n_test = _validate_shuffle_split(n_samples,
+        n_train, n_test, _, _ = _validate_shuffle_split(n_samples,
                                                   test_size=1./n_folds,
                                                   train_size=1.-(1./n_folds))
 
+        # TODO revise this test for the proposed modification of
+        # StratifiedShuffleSplit: when float, train_size + test_size must
+        # equal 1.0; when int, train_size + test_size must equal n_samples
         assert_equal(len(train), n_train)
         assert_equal(len(test), n_test)
         assert_equal(len(set(train).intersection(test)), 0)
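Given the TODOs above, a possible regression test for the new behaviour might look like this (a hypothetical sketch, not part of the PR):

```python
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

def test_stratify_across_classes_balances_train_set():
    # hypothetical test: every class contributes the same number of
    # training samples, dictated by the smallest class
    y = np.array([0] * 60 + [1] * 30 + [2] * 10)
    X = np.zeros((len(y), 1))
    sss = StratifiedShuffleSplit(n_iter=5, test_size=0.5, train_size=0.5,
                                 stratify_across_classes=True)
    for train_idx, test_idx in sss.split(X, y):
        # smallest class has 10 samples, so round(10 * 0.5) = 5 per class
        assert np.all(np.bincount(y[train_idx]) == 5)
        # train and test indices must never overlap
        assert len(set(train_idx) & set(test_idx)) == 0
```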