[MRG] Binned regression cv #14560

Open · wants to merge 39 commits into main
39 commits (changes from all commits)
a783df4
add BinnedStratifiedKFold
amueller Aug 2, 2019
83ee718
add tests
amueller Aug 2, 2019
b0f8f62
add BinnedStratifiedKFold to model selection __init__
amueller Aug 2, 2019
254180e
try to convert to "new" interface
amueller Aug 2, 2019
4e5af26
n_splits=5 is default
amueller Aug 2, 2019
9d896c3
add binnedstratifiedkfold to general test
amueller Aug 2, 2019
1d1598b
add to more tests
amueller Aug 5, 2019
6e78b06
replace implementation by binning and using StratifiedKFold
amueller Aug 16, 2019
6924b38
minor fixes, overload _make_test_folds
amueller Aug 16, 2019
b4fcdf5
add assertion on number of quantile bins for debugging
amueller Aug 16, 2019
885745b
rename quantile_bins to n_bins, simplify tests
amueller Aug 21, 2019
f823a34
add to classes.rst
amueller Aug 21, 2019
063ee0f
add user guide
amueller Aug 21, 2019
f750f77
add see also to stratified kfold
amueller Aug 21, 2019
2e15f56
change split output
amueller Aug 21, 2019
296c044
pep8
amueller Aug 21, 2019
f6606e3
whatsnew
amueller Aug 21, 2019
4dfc7fa
Merge branch 'master' into binned_regression_cv
amueller Aug 21, 2019
b939397
fix up docs
amueller Aug 21, 2019
38fdcf3
docstring nitpicks
amueller Aug 21, 2019
15655f6
remove unnecessary code
amueller Aug 22, 2019
6b82556
fix "see also" formatting
amueller Aug 22, 2019
4bd0fb2
improve coverage
amueller Aug 22, 2019
deecd92
typo
amueller Aug 22, 2019
7dad881
update random_state docs, add kfold to see also
amueller Sep 3, 2019
5353090
Merge branch 'master' into binned_regression_cv
amueller Sep 3, 2019
1fe52eb
update note to reflect new stratified k-fold
amueller Sep 3, 2019
02cb5f5
update tests to reflect new implementation of stratifiedkfold
amueller Sep 3, 2019
77383b7
work around weird digitize behavior
amueller Sep 3, 2019
6a5bc3a
better use of digitize
amueller Sep 3, 2019
3f70743
address hanmin's comments
amueller Sep 4, 2019
2f50e0c
Merge branch 'master' into binned_regression_cv
amueller Sep 4, 2019
bb0c1d2
fix doctests for new StratifiedKFold
amueller Sep 4, 2019
896a54b
Merge branch 'master' into binned_regression_cv
amueller Sep 10, 2019
db61f64
use searchsorted instead of digitize
amueller Sep 10, 2019
58bd95b
fix doctest
amueller Sep 10, 2019
d4c6297
Update sklearn/model_selection/_split.py
amueller Sep 19, 2019
9f373c2
Merge branch 'master' into binned_regression_cv
amueller Feb 7, 2020
d9f1c5c
Merge branch 'binned_regression_cv' of github.com:amueller/scikit-lea…
amueller Feb 7, 2020
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -1137,6 +1137,7 @@ Splitter Classes
:toctree: generated/
:template: class.rst

model_selection.BinnedStratifiedKFold
model_selection.GroupKFold
model_selection.GroupShuffleSplit
model_selection.KFold
35 changes: 32 additions & 3 deletions doc/modules/cross_validation.rst
@@ -517,7 +517,7 @@ Here is a visualization of the cross-validation behavior. Note that
validation that allows a finer control on the number of iterations and
the proportion of samples on each side of the train / test split.

- Cross-validation iterators with stratification based on class labels.
+ Cross-validation iterators with stratification based on targets
Member: need to update length of dashes below

Member Author: longer dashes are allowed :P

---------------------------------------------------------------------

Some classification problems can exhibit a large imbalance in the distribution
@@ -531,8 +531,8 @@ Stratified k-fold
^^^^^^^^^^^^^^^^^

:class:`StratifiedKFold` is a variation of *k-fold* which returns *stratified*
- folds: each set contains approximately the same percentage of samples of each
- target class as the complete set.
+ folds for classification: each set contains approximately the same percentage
+ of samples of each target class as the complete set.

Here is an example of stratified 3-fold cross-validation on a dataset with 50 samples from
two unbalanced classes. We show the number of samples in each class and compare with
@@ -569,6 +569,35 @@ Here is a visualization of the cross-validation behavior.
:class:`RepeatedStratifiedKFold` can be used to repeat Stratified K-Fold n times
with different randomization in each repetition.

Binned Stratified k-fold
Member: I would add this section below so that StratifiedKFold and StratifiedShuffleSplit are still together

Member: Maybe make it explicit that this is useful for regression in the title itself. So the two titles would become:

  • Stratified k-fold for Classification
  • Binned Stratified k-fold for Regression

^^^^^^^^^^^^^^^^^^^^^^^^
:class:`BinnedStratifiedKFold` is a variant of *k-fold* which returns *stratified*
fold for regression. Similarly to :class:`StratifiedKFold`, the goal is to make
Member suggested change:
- fold for regression. Similarly to :class:`StratifiedKFold`, the goal is to make
+ folds for regression. Similarly to :class:`StratifiedKFold`, the goal is to make

sure that the distribution of the target is similar within each fold. This
is achieved by binning the target into ``n_bins`` using quantiles of the target.
Member suggested change:
- is achieved by binning the target into ``n_bins`` using quantiles of the target.
+ is achieved by binning the target into ``n_bins`` bins using quantiles of the target.


As is shown in the following example, without stratification or binning,
there can be large differences between the target statistics in folds,
in particular if the data was sorted in some way::

>>> import numpy as np
>>> from sklearn.model_selection import BinnedStratifiedKFold, KFold
>>> X = np.random.uniform(size=(15, 3))
>>> y = np.linspace(0, 1, 15)
Member suggested change:
- >>> y = np.linspace(0, 1, 15)
+ >>> y = np.linspace(0, 1, 15) # sorted target

>>> bskf = BinnedStratifiedKFold(n_splits=3)
>>> kf = KFold(n_splits=3)
>>> for train, test in bskf.split(X, y):
... print("stratified test-set mean y: {:.3f}".format(y[test].mean()))
stratified test-set mean y: 0.429
stratified test-set mean y: 0.500
stratified test-set mean y: 0.571

>>> for train, test in kf.split(X, y):
... print("standard test-set mean y: {:.3f}".format(y[test].mean()))
standard test-set mean y: 0.143
standard test-set mean y: 0.500
standard test-set mean y: 0.857


Member: I think we should indicate that computing the bins requires sorting the targets first. This might be non-negligible for large datasets.


Stratified Shuffle Split
^^^^^^^^^^^^^^^^^^^^^^^^
4 changes: 4 additions & 0 deletions doc/whats_new/v0.22.rst
@@ -768,6 +768,10 @@ Changelog
plot model scalability (see learning_curve example).
:pr:`13938` by :user:`Hadrien Reboul <H4dr1en>`.

- |Enhancement| A new cross-validation iterator
:class:`model_selection.BinnedStratifiedKFold` for stratifying in the regression
setting was added. :pr:`14560` by :user:`Zech Xu <RNAer>` and `Andreas Müller`_.

- |Enhancement| :class:`model_selection.RandomizedSearchCV` now accepts lists
of parameter distributions. :pr:`14549` by `Andreas Müller`_.

2 changes: 2 additions & 0 deletions sklearn/model_selection/__init__.py
@@ -13,6 +13,7 @@
from ._split import GroupShuffleSplit
from ._split import StratifiedShuffleSplit
from ._split import PredefinedSplit
from ._split import BinnedStratifiedKFold
from ._split import train_test_split
from ._split import check_cv

@@ -30,6 +31,7 @@
from ._search import fit_grid_point

__all__ = ('BaseCrossValidator',
'BinnedStratifiedKFold',
'GridSearchCV',
'TimeSeriesSplit',
'KFold',
81 changes: 81 additions & 0 deletions sklearn/model_selection/_split.py
@@ -632,6 +632,7 @@ class StratifiedKFold(_BaseKFold):
See also
--------
RepeatedStratifiedKFold: Repeats Stratified K-Fold n times.
BinnedStratifiedKFold: Stratified K-Fold variant for regression targets.
"""

def __init__(self, n_splits=5, shuffle=False, random_state=None):
@@ -1906,6 +1907,86 @@ def get_n_splits(self, X=None, y=None, groups=None):
return len(self.unique_folds)


class BinnedStratifiedKFold(StratifiedKFold):
"""Stratified K-Folds cross-validator for regression

Provides train/test indices to split data in train/test sets.

This cross-validation object is a variation of KFold that returns
stratified folds by binning a regression target. The folds are made
by approximately preserving the distribution of targets in each bin.
Member: I don't understand this sentence. I think what you wrote below is clearer: "the test sets are such that all contain the same distribution of binned targets, or as close as possible." Also maybe add something like "In practice, this behaves as a StratifiedKFold object where the bin values are treated as if they were classes".
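For illustration, a minimal sketch of that equivalence, reusing the PR's two binning lines together with the released StratifiedKFold (the sketch itself is not part of the PR):

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = rng.uniform(size=50)

    # quantile-bin the continuous target, exactly as in _make_test_folds below
    n_bins = 5
    percentiles = np.percentile(y, np.linspace(0, 100, n_bins + 1))
    bins = np.searchsorted(percentiles[1:-1], y)  # ids in {0, ..., n_bins - 1}

    # StratifiedKFold only looks at label identity, so stratifying on the bin
    # ids yields test sets with (nearly) matching binned-target distributions
    for train, test in StratifiedKFold(n_splits=3).split(X, bins):
        print(np.bincount(bins[test], minlength=n_bins))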


Read more in the :ref:`User Guide <cross_validation>`.
Member: Shouldn't this link to the Binned Stratified k-fold section of the user guide?

Member Author: all splitters refer to the main page right now? We might split this up but then should probably update the links everywhere?

Member: personally I find it inconvenient to have to scroll the whole page down. I'm not too worried about inconsistencies here.


Parameters
----------
n_splits : int, default=5
Number of folds. Must be at least 2.

shuffle : boolean, default=False
Whether to shuffle each stratification of the data before splitting
into batches.

n_bins : int, default=10
Member: 5 to keep consistent with calibration_curve and KBinsDiscretizer?

Member Author: I feel like there's not really a reason to keep it consistent and 10 will be "better" in some sense and I don't see a downside of using 10.

Member: > 10 will be "better" in some sense

could you please provide more details?

Member Author: Well it discretizes less and uses more bins?

Member: > Well it discretizes less and uses more bins?

In this way, 10 is better than 5 and 20 is better than 10? I guess it depends on the dataset. Let's set 10 as the default if you like, though I still prefer 5 :)

How many quantile bins to use.

random_state : None, int or RandomState
When ``shuffle=True``, pseudo-random number generator state used for
shuffling.
See :term:`glossary <random_state>` for details.

Examples
--------
>>> from sklearn.model_selection import BinnedStratifiedKFold
>>> import numpy as np
>>> y = np.arange(11.0)
Member suggested change:
- >>> y = np.arange(11.0)
+ >>> y = np.arange(11)

>>> rng = np.random.RandomState(0)
>>> X = y + 0.1 * rng.randn(len(y))
>>> cv = BinnedStratifiedKFold(n_splits=3)
>>> skf = cv.split(X, y)
>>> print(cv)
BinnedStratifiedKFold(n_bins=5, n_splits=3, random_state=None...)
>>> for train_index, test_index in skf:
Member suggested change:
- >>> for train_index, test_index in skf:
+ >>> for train_index, test_index in cv.split(X, y):

and remove skf = cv.split(X, y) below?

... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
TRAIN: [ 1 2 4 6 7 8 10] TEST: [0 3 5 9]
TRAIN: [0 2 3 5 6 8 9] TEST: [ 1 4 7 10]
TRAIN: [ 0 1 3 4 5 7 9 10] TEST: [2 6 8]


Notes
-----
The implementation is designed to:

* Generate test sets such that all contain the same distribution of
binned targets, or as close as possible.
* Preserve order dependencies in the dataset ordering, when
``shuffle=False``: all samples from bin k in some test set were
contiguous in y, or separated in y by samples from classes other than k.
Member: did you mean the following?

- contiguous in y, or separated in y by samples from classes other than k.
+ contiguous in y, or separated in y by samples from bin other than k.

But regardless, I really don't understand what that means.
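The property can be checked mechanically; a hypothetical verification sketch (not from the PR), driving the released StratifiedKFold with the PR's bin ids and shuffle=False, where each bin's test samples come out at consecutive ranks:

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    y = np.linspace(0, 1, 20)  # sorted target, 4 samples per quantile bin
    X = np.zeros((20, 1))
    n_bins = 5
    percentiles = np.percentile(y, np.linspace(0, 100, n_bins + 1))
    bins = np.searchsorted(percentiles[1:-1], y)

    for _, test in StratifiedKFold(n_splits=2).split(X, bins):
        for k in np.unique(bins[test]):
            in_bin_k = np.flatnonzero(bins == k)  # bin-k samples, in y order
            ranks = np.searchsorted(in_bin_k, test[bins[test] == k])
            assert np.all(np.diff(ranks) == 1)  # contiguous within bin k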

* Generate test sets where the smallest and largest differ by at most one
Member suggested change:
- * Generate test sets where the smallest and largest differ by at most one
+ * Generate test sets where the sizes of the smallest and largest differ by at most one

sample.

See also
--------
KFold: k-fold generator without any stratification
StratifiedKFold: stratified k-fold generator for classification data
"""

def __init__(self, n_splits=5, shuffle=False, n_bins=5,
random_state=None):
super().__init__(n_splits, shuffle, random_state)
self.n_bins = n_bins
if n_bins < 2:
raise ValueError("Need at least two bins, got {}.".format(
n_bins))

def _make_test_folds(self, X, y):
Member: For consistency with StratifiedKFold, raise an error when y is a classification target? There's always the risk that users have more classes than n_bins, and then this doesn't act as StratifiedKFold anymore.
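A minimal sketch of such a guard (hypothetical, not part of this PR), relying on the existing type_of_target helper; the helper name below is illustrative:

    from sklearn.utils.multiclass import type_of_target

    def _check_regression_target(y):
        # hypothetical helper: reject targets that look like class labels
        y_type = type_of_target(y)
        if y_type != "continuous":
            raise ValueError(
                "BinnedStratifiedKFold expects a continuous target, got a "
                "target of type %r instead." % y_type)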

percentiles = np.percentile(
y, np.linspace(0, 100, self.n_bins + 1))
bins = np.searchsorted(percentiles[1:-1], y)
Member: That's pretty neat.

I don't mind too much, but we already have this binning logic duplicated in KBinsDiscretizer and in the BinMapper of the hist GBDTs.

Since computing percentiles requires sorting the array, it might be relevant to subsample when we compute the quantiles. That's what we do in the BinMapper at least.

I guess we can open an issue when merging and leave this for another PR?

Member: I think it's fine to repeat those two numpy lines between KBinsDiscretizer and this CV tool as those are as readable and more explicit than calling into a function that would just do that.

However I agree that the HGBC/R binning implementation is "better" (despite being more complex) because:

  • it supports subsampling before computing the bin thresholds with percentile which is very useful when n_samples > 1e5.
  • it can leverage multicore CPUs which is also very useful for large numbers of samples.

So +1 to open a dedicated PR to factorize this.

Member Author: I would say so.

return super()._make_test_folds(X, bins)
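A rough sketch of the subsampled, factored-out binning discussed in the thread above; the helper name and the 200000-sample cap are illustrative assumptions, not part of this PR:

    import numpy as np

    def _bin_by_quantiles(y, n_bins, subsample=200000, random_state=None):
        # hypothetical helper: compute quantile edges on a subsample to bound
        # the cost of np.percentile on very large targets, then bin everything
        rng = np.random.RandomState(random_state)
        if len(y) > subsample:
            y_ref = rng.choice(y, size=subsample, replace=False)
        else:
            y_ref = y
        percentiles = np.percentile(y_ref, np.linspace(0, 100, n_bins + 1))
        return np.searchsorted(percentiles[1:-1], y)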


class _CVIterableWrapper(BaseCrossValidator):
"""Wrapper class for old style cv objects and iterables."""
def __init__(self, cv):
110 changes: 95 additions & 15 deletions sklearn/model_selection/tests/test_split.py
@@ -37,14 +37,15 @@
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import BinnedStratifiedKFold

from sklearn.linear_model import Ridge

from sklearn.model_selection._split import _validate_shuffle_split
from sklearn.model_selection._split import _build_repr

from sklearn.datasets import load_digits
- from sklearn.datasets import make_classification
+ from sklearn.datasets import make_classification, make_regression

from sklearn.utils.fixes import comb

@@ -146,6 +147,7 @@ def test_cross_validator_with_default_params():
lopo = LeavePGroupsOut(p)
ss = ShuffleSplit(random_state=0)
ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2
bskf = BinnedStratifiedKFold(n_splits)

loo_repr = "LeaveOneOut()"
lpo_repr = "LeavePOut(p=2)"
Expand All @@ -156,15 +158,17 @@ def test_cross_validator_with_default_params():
ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, "
"test_size=None, train_size=None)")
ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"
bskf_repr = ("BinnedStratifiedKFold(n_bins=5, n_splits=2, "
"random_state=None, shuffle=False)")

n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits,
n_unique_groups, comb(n_unique_groups, p),
- n_shuffle_splits, 2]
+ n_shuffle_splits, 2, 2]

for i, (cv, cv_repr) in enumerate(zip(
- [loo, lpo, kf, skf, lolo, lopo, ss, ps],
+ [loo, lpo, kf, skf, lolo, lopo, ss, ps, bskf],
[loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr,
- ss_repr, ps_repr])):
+ ss_repr, ps_repr, bskf_repr])):
# Test if get_n_splits works correctly
assert n_splits_expected[i] == cv.get_n_splits(X, y, groups)

@@ -202,7 +206,8 @@ def test_2d_y():
ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
GroupShuffleSplit(), LeaveOneGroupOut(),
LeavePGroupsOut(n_groups=2), GroupKFold(n_splits=3),
- TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
+ TimeSeriesSplit(), PredefinedSplit(test_fold=groups),
+ BinnedStratifiedKFold()]
for splitter in splitters:
list(splitter.split(X, y, groups))
list(splitter.split(X, y_2d, groups))
@@ -686,10 +691,10 @@ def test_stratified_shuffle_split_iter():
assert_array_equal(np.unique(y[train]), np.unique(y[test]))
# Checks if folds keep classes proportions
p_train = (np.bincount(np.unique(y[train],
return_inverse=True)[1]) /
return_inverse=True)[1]) /
float(len(y[train])))
p_test = (np.bincount(np.unique(y[test],
return_inverse=True)[1]) /
return_inverse=True)[1]) /
float(len(y[test])))
assert_array_almost_equal(p_train, p_test, 1)
assert len(train) + len(test) == y.size
@@ -875,8 +880,7 @@ def test_leave_one_p_group_out():
assert repr(logo) == 'LeaveOneGroupOut()'
assert repr(lpgo_1) == 'LeavePGroupsOut(n_groups=1)'
assert repr(lpgo_2) == 'LeavePGroupsOut(n_groups=2)'
- assert (repr(LeavePGroupsOut(n_groups=3)) ==
-     'LeavePGroupsOut(n_groups=3)')
+ assert (repr(LeavePGroupsOut(n_groups=3)) == 'LeavePGroupsOut(n_groups=3)')

for j, (cv, p_groups_out) in enumerate(((logo, 1), (lpgo_1, 1),
(lpgo_2, 2))):
@@ -942,8 +946,7 @@ def test_leave_group_out_changing_groups():

# n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3
assert (
- 3 == LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X,
-     groups=groups))
+ 3 == LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X, groups=groups))
# n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups)
assert 3 == LeaveOneGroupOut().get_n_splits(X, y=X,
groups=groups)
@@ -1381,8 +1384,7 @@ def test_group_kfold():
# Check that folds have approximately the same size
assert len(folds) == len(groups)
for i in np.unique(folds):
- assert (tolerance >=
-     abs(sum(folds == i) - ideal_n_groups_per_fold))
+ assert (tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold))
Member suggested change:
- assert (tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold))
+ assert tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold)


# Check that each group appears only in 1 fold
for group in np.unique(groups):
@@ -1418,8 +1420,7 @@ def test_group_kfold():
# Check that folds have approximately the same size
assert len(folds) == len(groups)
for i in np.unique(folds):
- assert (tolerance >=
-     abs(sum(folds == i) - ideal_n_groups_per_fold))
+ assert (tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold))
Member suggested change:
- assert (tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold))
+ assert tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold)


# Check that each group appears only in 1 fold
with warnings.catch_warnings():
@@ -1586,6 +1587,85 @@ def test_leave_p_out_empty_trainset():
next(cv.split(X, y, groups=[1, 2]))


def test_binnedstratifiedkfold_error():
with pytest.raises(ValueError, match='Need at least two bins'):
BinnedStratifiedKFold(n_bins=1)


def test_binnedstratifiedkfold_balance():
rng = np.random.RandomState(0)
for _ in range(10):
Member: nit: parametrize test with seed = range(10)?

Member: +1: in case of failure, the error message will be easier to understand if it starts to fail only for some seed values.

n_splits = 2 + int(10 * rng.rand())
X, y = make_regression(noise=10, random_state=rng)
train_sizes = []
test_sizes = []

bskf = BinnedStratifiedKFold(n_splits=n_splits)

for train_index, test_index in bskf.split(X=X, y=y):
train_sizes.append(len(train_index))
test_sizes.append(len(test_index))

assert (np.max(test_sizes) - np.min(test_sizes)) <= 1
assert (np.max(train_sizes) - np.min(train_sizes)) <= 1
assert np.sum(test_sizes) == X.shape[0]
assert np.sum(train_sizes) == X.shape[0] * (n_splits - 1)
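A sketch of the parametrization suggested in the thread above (editorial, trimmed to the test-size checks; imports mirror the ones already at the top of this module):

    import numpy as np
    import pytest
    from sklearn.datasets import make_regression
    from sklearn.model_selection import BinnedStratifiedKFold

    @pytest.mark.parametrize('seed', range(10))
    def test_binnedstratifiedkfold_balance(seed):
        # one RNG per seed: a failure now names the seed that triggered it
        rng = np.random.RandomState(seed)
        n_splits = 2 + int(10 * rng.rand())
        X, y = make_regression(noise=10, random_state=rng)
        bskf = BinnedStratifiedKFold(n_splits=n_splits)
        test_sizes = [len(test) for _, test in bskf.split(X=X, y=y)]
        assert max(test_sizes) - min(test_sizes) <= 1
        assert sum(test_sizes) == X.shape[0]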


def test_binnedstratifiedkfold_stable_moments_between_splits():
"""check if BinnedStratifiedKFold performs on average better than KFold in
terms of lower between-fold variance of fold mean(y_test) and fold
std(y_test)
"""
binned_has_more_stable_std_list = []
binned_has_more_stable_mean_list = []
rng = np.random.RandomState(0)

for trial in range(100):
n_splits = 2 + int(10*rng.rand())
X, y = make_regression(noise=10, random_state=rng)

ymeans_binned = []
ystds_binned = []

skf = BinnedStratifiedKFold(n_splits=n_splits)

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
Member: By not shuffling the skf are you disadvantaging it (which is good for the test)? If so add a comment?


bins = np.percentile(y, np.linspace(0, 100, skf.n_bins + 1))
bins[-1] += 1e-8
Member: why?

Member Author: need to fix this.


for train_index, test_index in skf.split(X=X, y=y):
y_test = y[test_index]
ymeans_binned.append(y_test.mean())
ystds_binned.append(y_test.std())
hist_, _ = np.histogram(y[test_index], bins=bins)

assert all(abs(hist_ - np.mean(hist_)) <= 1)
Member: 1 seems totally arbitrary? Does it make sense w.r.t. the values that y can take?

Member Author: this is counts, so no, it's independent of the values y can take. The bin counts should differ by at most one.


ymeans_regular = []
ystds_regular = []
for train_index_reg, test_index_reg in kf.split(X=X, y=y):
ymeans_regular.append(y[test_index_reg].mean())
ystds_regular.append(y[test_index_reg].std())

binned_has_more_stable_std = np.std(
ystds_regular) > np.std(ystds_binned)
binned_has_more_stable_std_list.append(binned_has_more_stable_std)

binned_has_more_stable_mean = np.std(
ymeans_regular) > np.std(ymeans_binned)
binned_has_more_stable_mean_list.append(binned_has_more_stable_mean)

binned_has_more_stable_std_fraction = np.mean(
binned_has_more_stable_std_list)
binned_has_more_stable_mean_fraction = np.mean(
binned_has_more_stable_mean_list)

assert binned_has_more_stable_std_fraction > 0.5
Member: can you make these thresholds significantly higher? Else I guess it doesn't say much

assert binned_has_more_stable_mean_fraction > 0.5
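One hypothetical way to tighten these final assertions, as asked in the thread above: with 100 trials, test the observed win fraction against chance with a one-sided binomial tail instead of a bare > 0.5 threshold (the 0.8 figure below is a placeholder, not a measured result):

    from scipy.stats import binom

    n_trials = 100
    observed_fraction = 0.8  # stands in for binned_has_more_stable_std_fraction
    n_wins = round(observed_fraction * n_trials)
    p_value = binom.sf(n_wins - 1, n_trials, 0.5)  # P(wins >= n_wins | p = 0.5)
    assert p_value < 0.01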


@pytest.mark.parametrize('Klass', (KFold, StratifiedKFold))
def test_random_state_shuffle_false(Klass):
# passing a non-default random_state when shuffle=False makes no sense
Expand Down