From 6eb40285192f5530e6ef3e37aa70b0571164e1d8 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Tue, 24 Mar 2015 15:32:29 +0000
Subject: [PATCH 1/4] Added subject independent KFold

Changed SubjectIndependentKFold to DisjointGroupKFold

Cosmetic changes to test (fix seed correctly, use assert_equal for
meaningful error messages)

Changed name to DisjointLabelKFold

Added example of use

FIX: whitespace related doctest failure

FIX: Python 2.6 requires the field numbers in print

FIX: change docstring to comment in test function

DOC: moved docstring from function to class

FIX: added call to parent class

FIX: error in calling the parent

DOC: fixed doctest

FIX: doctest

Cosmetic changes (minor refactoring)

Optimised code (use np.bincount)

Cosmetic: use samples instead of weight for clarity

Minor fix: removed shuffle parameter

Cosmetic

Use mergesort instead of quicksort for reproducibility.

Changed variable name 'y' to 'label'.

Added test for degenerate case where n_folds > n_labels.

Documented the requirement n_labels >= n_folds.

DOC: improved description + added see also sections.

Fixed dtype of temporary arrays.

Improved test: check that one label is not in both test and training.

Added documentation for DisjointLabelKFold.
---
 doc/modules/classes.rst                |   1 +
 doc/modules/cross_validation.rst       |  27 +++++
 sklearn/cross_validation.py            | 139 +++++++++++++++++++++++++
 sklearn/tests/test_cross_validation.py |  68 ++++++++++++
 4 files changed, 235 insertions(+)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index d8b3dd4dbe1d6..d50a7375a1889 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -168,6 +168,7 @@ Classes
    cross_validation.LeavePOut
    cross_validation.PredefinedSplit
    cross_validation.StratifiedKFold
+   cross_validation.DisjointLabelKFold
    cross_validation.ShuffleSplit
    cross_validation.LabelShuffleSplit
    cross_validation.StratifiedShuffleSplit
diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 53afdf53550b1..ad1e9a8ddcf9c 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -261,6 +261,33 @@ two slightly unbalanced classes::
   [0 1 2 4 5 6 7] [3 8 9]
 
 
+Disjoint label KFold
+--------------------
+
+:class:`DisjointLabelKFold` is a variation of *k-fold* which ensures that the same
+label is not in both testing and training sets.
+This is necessary for example if you obtained data from different subjects and you
+want to avoid over-fitting (ie learning person specific features) by testing and 
+training on different subjects.
+
+Imagine you have three subjects, each with an associated number from 1 to 3::
+
+  >>> from sklearn.cross_validation import DisjointLabelKFold
+
+  >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
+
+  >>> dlkf = DisjointLabelKFold(labels, 3)
+  >>> for train, test in dlkf:
+  ...     print("%s %s" % (train, test))
+  [0 1 2 3 4 5] [6 7 8 9]
+  [0 1 2 6 7 8 9] [3 4 5]
+  [3 4 5 6 7 8 9] [0 1 2]
+
+Each subject is in a different testing fold, and the same subject is never in both
+testing and training.
+Notice that the folds do not have exactly the same size due to the imbalance in the data.
+
+
 Leave-One-Out - LOO
 -------------------
 
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 4c49bee86517d..c9b88832b102a 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -304,6 +304,8 @@ class KFold(_BaseKFold):
     StratifiedKFold: take label information into account to avoid building
     folds with imbalanced class distributions (for binary or multiclass
     classification tasks).
+
+    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, n, n_folds=3, shuffle=False,
@@ -339,6 +341,133 @@ def __len__(self):
         return self.n_folds
 
 
+def disjoint_label_folds(labels, n_folds=3):
+    """Creates folds where a same label is not in two different folds.
+    
+    Parameters
+    ----------
+    labels: numpy array, shape (n_samples,)
+        Contains an id for each sample.
+        The folds are built so that the same id doesn't appear in two different folds.
+    
+    n_folds: int, default=3
+        Number of folds to split the data into.
+        
+    Returns
+    -------
+    folds: numpy array of shape (n_samples, )
+        Array of integers between 0 and (n_folds - 1).
+        Folds[i] contains the folds to which sample i is assigned.
+        
+    Notes
+    -----
+    The folds are built by distributing the labels by frequency of appearance.
+    The number of labels has to be at least equal to the number of folds.
+    """
+    labels = np.array(labels)
+    unique_labels, labels = np.unique(labels, return_inverse=True)
+    n_labels = len(unique_labels)
+    if n_folds > n_labels:
+        raise ValueError(
+                ("Cannot have number of folds n_folds={0} greater"
+                 " than the number of labels: {1}.").format(n_folds, n_labels))
+    
+    # number of occurrence of each label (its "weight")
+    samples_per_label = np.bincount(labels)
+    # We want to distribute the most frequent labels first
+    ind = np.argsort(samples_per_label, kind='mergesort')[::-1]
+    samples_per_label = samples_per_label[ind]
+
+    # Total weight of each fold
+    samples_per_fold = np.zeros(n_folds, dtype=np.uint64)
+
+    # Mapping from label index to fold index
+    label_to_fold = np.zeros(len(unique_labels), dtype=np.uintp)
+    
+    # While there are weights, distribute them
+    # Specifically, add the biggest weight to the lightest fold
+    for label_index, w in enumerate(samples_per_label):
+        ind_min = np.argmin(samples_per_fold)
+        samples_per_fold[ind_min] += w
+        label_to_fold[ind[label_index]] = ind_min
+    
+    folds = label_to_fold[labels]
+
+    return folds
+
+
+class DisjointLabelKFold(_BaseKFold):
+    """K-fold iterator variant with non-overlapping labels.
+
+    The same label will not appear in two different folds (the number of
+    labels has to be at least equal to the number of folds).
+
+    The folds are approximately balanced in the sense so that the number of
+    distinct labels is approximately the same in each fold.
+
+    Parameters
+    ----------
+    labels : array-like with shape (n_samples, )
+        Contains a label for each sample.
+        The folds are built so that the same label doesn't appear in two different folds.
+
+    n_folds : int, default is 3
+        Number of folds.
+
+    Examples
+    --------
+    >>> from sklearn import cross_validation
+    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
+    >>> y = np.array([1, 2, 3, 4])
+    >>> labels = np.array([0, 0, 2, 2])
+    >>> dl_kfold = cross_validation.DisjointLabelKFold(labels, n_folds=2)
+    >>> len(dl_kfold)
+    2
+    >>> print(dl_kfold)
+    sklearn.cross_validation.DisjointLabelKFold(n_labels=4, n_folds=2)
+    >>> for train_index, test_index in dl_kfold:
+    ...     print("TRAIN:", train_index, "TEST:", test_index)
+    ...     X_train, X_test = X[train_index], X[test_index]
+    ...     y_train, y_test = y[train_index], y[test_index]
+    ...     print(X_train, X_test, y_train, y_test)
+    ... 
+    TRAIN: [0 1] TEST: [2 3]
+    [[1 2]
+     [3 4]] [[5 6]
+     [7 8]] [1 2] [3 4]
+    TRAIN: [2 3] TEST: [0 1]
+    [[5 6]
+     [7 8]] [[1 2]
+     [3 4]] [3 4] [1 2]
+
+    See also
+    --------
+    LeaveOneLabelOut for splitting the data according to explicit,
+    domain-specific stratification of the dataset.
+    """
+    def __init__(self, labels, n_folds=3):
+        # No shuffling implemented yet
+        super(DisjointLabelKFold, self).__init__(len(labels), n_folds, False, None)
+        self.n_folds = n_folds
+        self.n = len(labels)
+        self.idxs = disjoint_label_folds(labels=labels, n_folds=n_folds)
+
+    def _iter_test_indices(self):
+        for i in range(self.n_folds):
+            yield (self.idxs == i)
+
+    def __repr__(self):
+        return '{0}.{1}(n_labels={2}, n_folds={3})'.format(
+            self.__class__.__module__,
+            self.__class__.__name__,
+            self.n,
+            self.n_folds,
+        )
+
+    def __len__(self):
+        return self.n_folds
+
+
 class StratifiedKFold(_BaseKFold):
     """Stratified K-Folds cross validation iterator
 
@@ -389,6 +518,9 @@ class StratifiedKFold(_BaseKFold):
     All the folds have size trunc(n_samples / n_folds), the last one has the
     complementary.
 
+    See also
+    --------
+    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, y, n_folds=3, shuffle=False,
@@ -497,6 +629,9 @@ class LeaveOneLabelOut(_PartitionIterator):
      [3 4]] [[5 6]
      [7 8]] [1 2] [1 2]
 
+    See also
+    --------
+    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, labels):
@@ -572,6 +707,10 @@ class LeavePLabelOut(_PartitionIterator):
     TRAIN: [0] TEST: [1 2]
     [[1 2]] [[3 4]
      [5 6]] [1] [2 1]
+
+    See also
+    --------
+    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, labels, p):
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 3c45261103411..1bdd195621615 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -13,6 +13,7 @@
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_greater
+from sklearn.utils.testing import assert_greater_equal
 from sklearn.utils.testing import assert_less
 from sklearn.utils.testing import assert_not_equal
 from sklearn.utils.testing import assert_array_almost_equal
@@ -1153,3 +1154,70 @@ def test_cross_val_predict_sparse_prediction():
     preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10)
     preds_sparse = preds_sparse.toarray()
     assert_array_almost_equal(preds_sparse, preds)
+
+
+def test_disjoint_label_folds():
+    ## Check that the function produces equilibrated folds
+    ##      with no label appearing in two different folds
+
+    # Fix the seed for reproducibility
+    rng = np.random.RandomState(0)
+
+    # Parameters of the test
+    n_labels = 15
+    n_samples = 1000
+    n_folds = 5
+
+    # Construct the test data
+    tolerance = 0.05 * n_samples # 5 percent error allowed
+    labels = rng.randint(0, n_labels, n_samples)
+    folds = cval.disjoint_label_folds(labels, n_folds)
+    ideal_n_labels_per_fold = n_samples // n_folds
+
+    # Check that folds have approximately the same size
+    assert_equal(len(folds), len(labels))
+    for i in np.unique(folds):
+        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold))
+
+    # Check that each label appears only in 1 fold
+    for label in np.unique(labels):
+        assert_equal(len(np.unique(folds[labels == label])), 1)
+
+    # Check that no label is on both sides of the split
+    labels = np.asarray(labels, dtype=object)  # to allow fancy indexing on labels
+    for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds):
+        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
+
+    # Construct the test data
+    labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
+                'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
+                'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
+                'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
+                'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
+                'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
+                'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
+
+    n_labels = len(np.unique(labels))
+    n_samples = len(labels)
+    n_folds = 5
+    tolerance = 0.05 * n_samples # 5 percent error allowed
+    folds = cval.disjoint_label_folds(labels, n_folds)
+    ideal_n_labels_per_fold = n_samples // n_folds
+
+    # Check that folds have approximately the same size
+    assert_equal(len(folds), len(labels))
+    for i in np.unique(folds):
+        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold))
+
+    # Check that each label appears only in 1 fold
+    for label in np.unique(labels):
+        assert_equal(len(np.unique(folds[labels == label])), 1)
+
+    # Check that no label is on both sides of the split
+    labels = np.asarray(labels, dtype=object)  # to allow fancy indexing on labels
+    for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds):
+        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
+
+    # Should fail if there are more folds than labels
+    labels = np.array([1, 1, 1, 2, 2])
+    assert_raises(ValueError, cval.disjoint_label_folds, labels, n_folds=3)

From cf7bc7b035efaefea7dc523e814a65cc3d0c5514 Mon Sep 17 00:00:00 2001
From: Gilles Louppe <g.louppe@gmail.com>
Date: Sun, 30 Aug 2015 18:56:57 +0200
Subject: [PATCH 2/4] ENH: rename to LabelKFold

COSMIT: doc, pep8, etc

Refactor code
---
 doc/modules/classes.rst                |   7 +-
 doc/modules/cross_validation.rst       |  32 ++---
 sklearn/cross_validation.py            | 169 ++++++++++++-------------
 sklearn/tests/test_cross_validation.py | 132 ++++++++++---------
 4 files changed, 164 insertions(+), 176 deletions(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index d50a7375a1889..6001c3d9f92a0 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -162,17 +162,18 @@ Classes
    :template: class.rst
 
    cross_validation.KFold
+   cross_validation.LabelKFold
+   cross_validation.LabelShuffleSplit
    cross_validation.LeaveOneLabelOut
    cross_validation.LeaveOneOut
    cross_validation.LeavePLabelOut
    cross_validation.LeavePOut
    cross_validation.PredefinedSplit
-   cross_validation.StratifiedKFold
-   cross_validation.DisjointLabelKFold
    cross_validation.ShuffleSplit
-   cross_validation.LabelShuffleSplit
+   cross_validation.StratifiedKFold
    cross_validation.StratifiedShuffleSplit
 
+
 .. autosummary::
    :toctree: generated/
    :template: function.rst
diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index ad1e9a8ddcf9c..93c30eb342f19 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -261,31 +261,31 @@ two slightly unbalanced classes::
   [0 1 2 4 5 6 7] [3 8 9]
 
 
-Disjoint label KFold
---------------------
+Label k-fold
+------------
 
-:class:`DisjointLabelKFold` is a variation of *k-fold* which ensures that the same
-label is not in both testing and training sets.
-This is necessary for example if you obtained data from different subjects and you
-want to avoid over-fitting (ie learning person specific features) by testing and 
-training on different subjects.
+:class:`LabelKFold` is a variation of *k-fold* which ensures that the same
+label is not in both testing and training sets. This is necessary for example
+if you obtained data from different subjects and you want to avoid over-fitting
+(i.e., learning person specific features) by testing and training on different
+subjects.
 
 Imagine you have three subjects, each with an associated number from 1 to 3::
 
-  >>> from sklearn.cross_validation import DisjointLabelKFold
+  >>> from sklearn.cross_validation import LabelKFold
 
   >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
 
-  >>> dlkf = DisjointLabelKFold(labels, 3)
-  >>> for train, test in dlkf:
+  >>> lkf = LabelKFold(labels, 3)
+  >>> for train, test in lkf:
   ...     print("%s %s" % (train, test))
   [0 1 2 3 4 5] [6 7 8 9]
   [0 1 2 6 7 8 9] [3 4 5]
   [3 4 5 6 7 8 9] [0 1 2]
 
-Each subject is in a different testing fold, and the same subject is never in both
-testing and training.
-Notice that the folds do not have exactly the same size due to the imbalance in the data.
+Each subject is in a different testing fold, and the same subject is never in
+both testing and training. Notice that the folds do not have exactly the same
+size due to the imbalance in the data.
 
 
 Leave-One-Out - LOO
@@ -462,15 +462,15 @@ Label-Shuffle-Split
 
 :class:`LabelShuffleSplit`
 
-The :class:`LabelShuffleSplit` iterator behaves as a combination of 
-:class:`ShuffleSplit` and :class:`LeavePLabelsOut`, and generates a 
+The :class:`LabelShuffleSplit` iterator behaves as a combination of
+:class:`ShuffleSplit` and :class:`LeavePLabelsOut`, and generates a
 sequence of randomized partitions in which a subset of labels are held
 out for each split.
 
 Here is a usage example::
 
   >>> from sklearn.cross_validation import LabelShuffleSplit
-  
+
   >>> labels = [1, 1, 2, 2, 3, 3, 4, 4]
   >>> slo = LabelShuffleSplit(labels, n_iter=4, test_size=0.5,
   ...                        random_state=0)
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index c9b88832b102a..a9808a8de65bf 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -33,6 +33,7 @@
 from .utils.fixes import bincount
 
 __all__ = ['KFold',
+           'LabelKFold',
            'LeaveOneLabelOut',
            'LeaveOneOut',
            'LeavePLabelOut',
@@ -273,7 +274,7 @@ class KFold(_BaseKFold):
         Whether to shuffle the data before splitting into batches.
 
     random_state : None, int or RandomState
-        When shuffle=True, pseudo-random number generator state used for 
+        When shuffle=True, pseudo-random number generator state used for
         shuffling. If None, use default numpy RNG for shuffling.
 
     Examples
@@ -305,7 +306,7 @@ class KFold(_BaseKFold):
     folds with imbalanced class distributions (for binary or multiclass
     classification tasks).
 
-    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
+    LabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, n, n_folds=3, shuffle=False,
@@ -341,78 +342,31 @@ def __len__(self):
         return self.n_folds
 
 
-def disjoint_label_folds(labels, n_folds=3):
-    """Creates folds where a same label is not in two different folds.
-    
-    Parameters
-    ----------
-    labels: numpy array, shape (n_samples,)
-        Contains an id for each sample.
-        The folds are built so that the same id doesn't appear in two different folds.
-    
-    n_folds: int, default=3
-        Number of folds to split the data into.
-        
-    Returns
-    -------
-    folds: numpy array of shape (n_samples, )
-        Array of integers between 0 and (n_folds - 1).
-        Folds[i] contains the folds to which sample i is assigned.
-        
-    Notes
-    -----
-    The folds are built by distributing the labels by frequency of appearance.
-    The number of labels has to be at least equal to the number of folds.
-    """
-    labels = np.array(labels)
-    unique_labels, labels = np.unique(labels, return_inverse=True)
-    n_labels = len(unique_labels)
-    if n_folds > n_labels:
-        raise ValueError(
-                ("Cannot have number of folds n_folds={0} greater"
-                 " than the number of labels: {1}.").format(n_folds, n_labels))
-    
-    # number of occurrence of each label (its "weight")
-    samples_per_label = np.bincount(labels)
-    # We want to distribute the most frequent labels first
-    ind = np.argsort(samples_per_label, kind='mergesort')[::-1]
-    samples_per_label = samples_per_label[ind]
-
-    # Total weight of each fold
-    samples_per_fold = np.zeros(n_folds, dtype=np.uint64)
-
-    # Mapping from label index to fold index
-    label_to_fold = np.zeros(len(unique_labels), dtype=np.uintp)
-    
-    # While there are weights, distribute them
-    # Specifically, add the biggest weight to the lightest fold
-    for label_index, w in enumerate(samples_per_label):
-        ind_min = np.argmin(samples_per_fold)
-        samples_per_fold[ind_min] += w
-        label_to_fold[ind[label_index]] = ind_min
-    
-    folds = label_to_fold[labels]
-
-    return folds
-
-
-class DisjointLabelKFold(_BaseKFold):
+class LabelKFold(_BaseKFold):
     """K-fold iterator variant with non-overlapping labels.
 
     The same label will not appear in two different folds (the number of
-    labels has to be at least equal to the number of folds).
+    distinct labels has to be at least equal to the number of folds).
 
-    The folds are approximately balanced in the sense so that the number of
+    The folds are approximately balanced in the sense that the number of
     distinct labels is approximately the same in each fold.
 
     Parameters
     ----------
     labels : array-like with shape (n_samples, )
         Contains a label for each sample.
-        The folds are built so that the same label doesn't appear in two different folds.
+        The folds are built so that the same label does not appear in two
+        different folds.
 
-    n_folds : int, default is 3
-        Number of folds.
+    n_folds : int, default=3
+        Number of folds. Must be at least 2.
+
+    shuffle : boolean, optional
+        Whether to shuffle the data before splitting into batches.
+
+    random_state : None, int or RandomState
+        When shuffle=True, pseudo-random number generator state used for
+        shuffling. If None, use default numpy RNG for shuffling.
 
     Examples
     --------
@@ -420,17 +374,17 @@ class DisjointLabelKFold(_BaseKFold):
     >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
     >>> y = np.array([1, 2, 3, 4])
     >>> labels = np.array([0, 0, 2, 2])
-    >>> dl_kfold = cross_validation.DisjointLabelKFold(labels, n_folds=2)
-    >>> len(dl_kfold)
+    >>> label_kfold = cross_validation.LabelKFold(labels, n_folds=2)
+    >>> len(label_kfold)
     2
-    >>> print(dl_kfold)
-    sklearn.cross_validation.DisjointLabelKFold(n_labels=4, n_folds=2)
-    >>> for train_index, test_index in dl_kfold:
+    >>> print(label_kfold)
+    sklearn.cross_validation.LabelKFold(n_labels=4, n_folds=2)
+    >>> for train_index, test_index in label_kfold:
     ...     print("TRAIN:", train_index, "TEST:", test_index)
     ...     X_train, X_test = X[train_index], X[test_index]
     ...     y_train, y_test = y[train_index], y[test_index]
     ...     print(X_train, X_test, y_train, y_test)
-    ... 
+    ...
     TRAIN: [0 1] TEST: [2 3]
     [[1 2]
      [3 4]] [[5 6]
@@ -445,12 +399,47 @@ class DisjointLabelKFold(_BaseKFold):
     LeaveOneLabelOut for splitting the data according to explicit,
     domain-specific stratification of the dataset.
     """
-    def __init__(self, labels, n_folds=3):
-        # No shuffling implemented yet
-        super(DisjointLabelKFold, self).__init__(len(labels), n_folds, False, None)
-        self.n_folds = n_folds
+    def __init__(self, labels, n_folds=3, shuffle=False, random_state=None):
+        super(LabelKFold, self).__init__(len(labels), n_folds, shuffle,
+                                         random_state)
+
         self.n = len(labels)
-        self.idxs = disjoint_label_folds(labels=labels, n_folds=n_folds)
+        self.n_folds = n_folds
+
+        unique_labels, labels = np.unique(labels, return_inverse=True)
+        n_labels = len(unique_labels)
+
+        if n_folds > n_labels:
+            raise ValueError(
+                    ("Cannot have number of folds n_folds={0} greater"
+                     " than the number of labels: {1}.").format(n_folds,
+                                                                n_labels))
+
+        # Weight labels by their number of occurences
+        samples_per_label = np.bincount(labels)
+
+        # Distribute the most frequent labels first
+        ind = np.argsort(samples_per_label)[::-1]
+        samples_per_label = samples_per_label[ind]
+
+        # Total weight of each fold
+        samples_per_fold = np.zeros(n_folds, dtype=np.uint64)
+
+        # Mapping from label index to fold index
+        label_to_fold = np.zeros(len(unique_labels), dtype=np.uintp)
+
+        # While there are weights, distribute them
+        # Specifically, add the biggest weight to the lightest fold
+        for label_index, w in enumerate(samples_per_label):
+            ind_min = np.argmin(samples_per_fold)
+            samples_per_fold[ind_min] += w
+            label_to_fold[ind[label_index]] = ind_min
+
+        self.idxs = label_to_fold[labels]
+
+        if shuffle:
+            rng = check_random_state(self.random_state)
+            rng.shuffle(self.idxs)
 
     def _iter_test_indices(self):
         for i in range(self.n_folds):
@@ -492,7 +481,7 @@ class StratifiedKFold(_BaseKFold):
         into batches.
 
     random_state : None, int or RandomState
-        When shuffle=True, pseudo-random number generator state used for 
+        When shuffle=True, pseudo-random number generator state used for
         shuffling. If None, use default numpy RNG for shuffling.
 
     Examples
@@ -520,7 +509,7 @@ class StratifiedKFold(_BaseKFold):
 
     See also
     --------
-    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
+    LabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, y, n_folds=3, shuffle=False,
@@ -631,7 +620,7 @@ class LeaveOneLabelOut(_PartitionIterator):
 
     See also
     --------
-    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
+    LabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, labels):
@@ -710,7 +699,7 @@ class LeavePLabelOut(_PartitionIterator):
 
     See also
     --------
-    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
+    LabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, labels, p):
@@ -1218,11 +1207,11 @@ def cross_val_predict(estimator, X, y=None, cv=None, n_jobs=1,
         supervised learning.
 
     cv : integer or cross-validation generator, optional, default=3
-        A cross-validation generator to use. If int, determines the number 
-        of folds in StratifiedKFold if estimator is a classifier and the 
-        target y is binary or multiclass, or the number of folds in KFold 
+        A cross-validation generator to use. If int, determines the number
+        of folds in StratifiedKFold if estimator is a classifier and the
+        target y is binary or multiclass, or the number of folds in KFold
         otherwise.
-        Specific cross-validation objects can be passed, see 
+        Specific cross-validation objects can be passed, see
         sklearn.cross_validation module for the list of possible objects.
         This generator must include all elements in the test set exactly once.
         Otherwise, a ValueError is raised.
@@ -1387,11 +1376,11 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
         ``scorer(estimator, X, y)``.
 
     cv : integer or cross-validation generator, optional, default=3
-        A cross-validation generator to use. If int, determines the number 
-        of folds in StratifiedKFold if estimator is a classifier and the 
-        target y is binary or multiclass, or the number of folds in KFold 
+        A cross-validation generator to use. If int, determines the number
+        of folds in StratifiedKFold if estimator is a classifier and the
+        target y is binary or multiclass, or the number of folds in KFold
         otherwise.
-        Specific cross-validation objects can be passed, see 
+        Specific cross-validation objects can be passed, see
         sklearn.cross_validation module for the list of possible objects.
 
     n_jobs : integer, optional
@@ -1708,11 +1697,11 @@ def permutation_test_score(estimator, X, y, cv=None,
         ``scorer(estimator, X, y)``.
 
     cv : integer or cross-validation generator, optional, default=3
-        A cross-validation generator to use. If int, determines the number 
-        of folds in StratifiedKFold if estimator is a classifier and the 
-        target y is binary or multiclass, or the number of folds in KFold 
+        A cross-validation generator to use. If int, determines the number
+        of folds in StratifiedKFold if estimator is a classifier and the
+        target y is binary or multiclass, or the number of folds in KFold
         otherwise.
-        Specific cross-validation objects can be passed, see 
+        Specific cross-validation objects can be passed, see
         sklearn.cross_validation module for the list of possible objects.
 
     n_permutations : integer, optional
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 1bdd195621615..df2d90877bc50 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -359,6 +359,71 @@ def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
     assert_greater(mean_score, 0.85)
 
 
+def test_label_kfold():
+    rng = np.random.RandomState(0)
+
+    # Parameters of the test
+    n_labels = 15
+    n_samples = 1000
+    n_folds = 5
+
+    # Construct the test data
+    tolerance = 0.05 * n_samples  # 5 percent error allowed
+    labels = rng.randint(0, n_labels, n_samples)
+    folds = cval.LabelKFold(labels, n_folds).idxs
+    ideal_n_labels_per_fold = n_samples // n_folds
+
+    # Check that folds have approximately the same size
+    assert_equal(len(folds), len(labels))
+    for i in np.unique(folds):
+        assert_greater_equal(tolerance,
+                             abs(sum(folds == i) - ideal_n_labels_per_fold))
+
+    # Check that each label appears only in 1 fold
+    for label in np.unique(labels):
+        assert_equal(len(np.unique(folds[labels == label])), 1)
+
+    # Check that no label is on both sides of the split
+    labels = np.asarray(labels, dtype=object)
+    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
+        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
+
+    # Construct the test data
+    labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
+              'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
+              'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
+              'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
+              'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
+              'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
+              'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
+
+    n_labels = len(np.unique(labels))
+    n_samples = len(labels)
+    n_folds = 5
+    tolerance = 0.05 * n_samples  # 5 percent error allowed
+    folds = cval.LabelKFold(labels, n_folds).idxs
+    ideal_n_labels_per_fold = n_samples // n_folds
+
+    # Check that folds have approximately the same size
+    assert_equal(len(folds), len(labels))
+    for i in np.unique(folds):
+        assert_greater_equal(tolerance,
+                             abs(sum(folds == i) - ideal_n_labels_per_fold))
+
+    # Check that each label appears only in 1 fold
+    for label in np.unique(labels):
+        assert_equal(len(np.unique(folds[labels == label])), 1)
+
+    # Check that no label is on both sides of the split
+    labels = np.asarray(labels, dtype=object)
+    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
+        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
+
+    # Should fail if there are more folds than labels
+    labels = np.array([1, 1, 1, 2, 2])
+    assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
+
+
 def test_shuffle_split():
     ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
     ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
@@ -1154,70 +1219,3 @@ def test_cross_val_predict_sparse_prediction():
     preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10)
     preds_sparse = preds_sparse.toarray()
     assert_array_almost_equal(preds_sparse, preds)
-
-
-def test_disjoint_label_folds():
-    ## Check that the function produces equilibrated folds
-    ##      with no label appearing in two different folds
-
-    # Fix the seed for reproducibility
-    rng = np.random.RandomState(0)
-
-    # Parameters of the test
-    n_labels = 15
-    n_samples = 1000
-    n_folds = 5
-
-    # Construct the test data
-    tolerance = 0.05 * n_samples # 5 percent error allowed
-    labels = rng.randint(0, n_labels, n_samples)
-    folds = cval.disjoint_label_folds(labels, n_folds)
-    ideal_n_labels_per_fold = n_samples // n_folds
-
-    # Check that folds have approximately the same size
-    assert_equal(len(folds), len(labels))
-    for i in np.unique(folds):
-        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold))
-
-    # Check that each label appears only in 1 fold
-    for label in np.unique(labels):
-        assert_equal(len(np.unique(folds[labels == label])), 1)
-
-    # Check that no label is on both sides of the split
-    labels = np.asarray(labels, dtype=object)  # to allow fancy indexing on labels
-    for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds):
-        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
-
-    # Construct the test data
-    labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
-                'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
-                'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
-                'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
-                'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
-                'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
-                'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
-
-    n_labels = len(np.unique(labels))
-    n_samples = len(labels)
-    n_folds = 5
-    tolerance = 0.05 * n_samples # 5 percent error allowed
-    folds = cval.disjoint_label_folds(labels, n_folds)
-    ideal_n_labels_per_fold = n_samples // n_folds
-
-    # Check that folds have approximately the same size
-    assert_equal(len(folds), len(labels))
-    for i in np.unique(folds):
-        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold))
-
-    # Check that each label appears only in 1 fold
-    for label in np.unique(labels):
-        assert_equal(len(np.unique(folds[labels == label])), 1)
-
-    # Check that no label is on both sides of the split
-    labels = np.asarray(labels, dtype=object)  # to allow fancy indexing on labels
-    for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds):
-        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
-
-    # Should fail if there are more folds than labels
-    labels = np.array([1, 1, 1, 2, 2])
-    assert_raises(ValueError, cval.disjoint_label_folds, labels, n_folds=3)

From c769a65e156b7281c5423ef359782a7be60a62af Mon Sep 17 00:00:00 2001
From: Gilles Louppe <g.louppe@gmail.com>
Date: Mon, 7 Sep 2015 10:08:17 +0200
Subject: [PATCH 3/4] COSMIT: variable names, documentation, etc

---
 doc/modules/cross_validation.rst |  2 +-
 sklearn/cross_validation.py      | 33 ++++++++++++++++----------------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 93c30eb342f19..e76b95c6e48be 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -276,7 +276,7 @@ Imagine you have three subjects, each with an associated number from 1 to 3::
 
   >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
 
-  >>> lkf = LabelKFold(labels, 3)
+  >>> lkf = LabelKFold(labels, n_folds=3)
   >>> for train, test in lkf:
   ...     print("%s %s" % (train, test))
   [0 1 2 3 4 5] [6 7 8 9]
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index a9808a8de65bf..2e988d5985cf8 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -279,10 +279,10 @@ class KFold(_BaseKFold):
 
     Examples
     --------
-    >>> from sklearn import cross_validation
+    >>> from sklearn.cross_validation import KFold
     >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
     >>> y = np.array([1, 2, 3, 4])
-    >>> kf = cross_validation.KFold(4, n_folds=2)
+    >>> kf = KFold(4, n_folds=2)
     >>> len(kf)
     2
     >>> print(kf)  # doctest: +NORMALIZE_WHITESPACE
@@ -370,11 +370,11 @@ class LabelKFold(_BaseKFold):
 
     Examples
     --------
-    >>> from sklearn import cross_validation
+    >>> from sklearn.cross_validation import LabelKFold
     >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
     >>> y = np.array([1, 2, 3, 4])
     >>> labels = np.array([0, 0, 2, 2])
-    >>> label_kfold = cross_validation.LabelKFold(labels, n_folds=2)
+    >>> label_kfold = LabelKFold(labels, n_folds=2)
     >>> len(label_kfold)
     2
     >>> print(label_kfold)
@@ -416,24 +416,23 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None):
                                                                 n_labels))
 
         # Weight labels by their number of occurences
-        samples_per_label = np.bincount(labels)
+        n_samples_per_label = np.bincount(labels)
 
         # Distribute the most frequent labels first
-        ind = np.argsort(samples_per_label)[::-1]
-        samples_per_label = samples_per_label[ind]
+        indices = np.argsort(n_samples_per_label)[::-1]
+        n_samples_per_label = n_samples_per_label[indices]
 
         # Total weight of each fold
-        samples_per_fold = np.zeros(n_folds, dtype=np.uint64)
+        n_samples_per_fold = np.zeros(n_folds)
 
         # Mapping from label index to fold index
-        label_to_fold = np.zeros(len(unique_labels), dtype=np.uintp)
+        label_to_fold = np.zeros(len(unique_labels))
 
-        # While there are weights, distribute them
-        # Specifically, add the biggest weight to the lightest fold
-        for label_index, w in enumerate(samples_per_label):
-            ind_min = np.argmin(samples_per_fold)
-            samples_per_fold[ind_min] += w
-            label_to_fold[ind[label_index]] = ind_min
+        # Distribute samples by adding the largest weight to the lightest fold
+        for label_index, weight in enumerate(n_samples_per_label):
+            lightest_fold = np.argmin(n_samples_per_fold)
+            n_samples_per_fold[lightest_fold] += weight
+            label_to_fold[indices[label_index]] = lightest_fold
 
         self.idxs = label_to_fold[labels]
 
@@ -486,10 +485,10 @@ class StratifiedKFold(_BaseKFold):
 
     Examples
     --------
-    >>> from sklearn import cross_validation
+    >>> from sklearn.cross_validation import StratifiedKFold
     >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
     >>> y = np.array([0, 0, 1, 1])
-    >>> skf = cross_validation.StratifiedKFold(y, n_folds=2)
+    >>> skf = StratifiedKFold(y, n_folds=2)
     >>> len(skf)
     2
     >>> print(skf)  # doctest: +NORMALIZE_WHITESPACE

From 410e7f2c2a2a909e47683f9d43d87962628d7b99 Mon Sep 17 00:00:00 2001
From: Gilles Louppe <g.louppe@gmail.com>
Date: Tue, 8 Sep 2015 07:48:26 +0200
Subject: [PATCH 4/4] ENH: remove unnecessary assignments

---
 doc/whats_new.rst           | 13 ++++++++-----
 sklearn/cross_validation.py |  3 ---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 7ba8e03378b11..66df0155d76f4 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -34,9 +34,12 @@ New features
      function into a ``Pipeline``-compatible transformer object.
      By Joe Jevnik.
 
-   - :class:`cross_validation.LabelShuffleSplit` generates random train-test
-     splits, similar to :class:`cross_validation.ShuffleSplit`, except that
-     the splits are conditioned on a label array. By `Brian McFee`_.
+   - The new classes :class:`cross_validation.LabelKFold` and
+     :class:`cross_validation.LabelShuffleSplit` generate train-test folds
+     similar to those of :class:`cross_validation.KFold` and
+     :class:`cross_validation.ShuffleSplit`, respectively, except that the
+     folds are conditioned on a label array. By `Brian McFee`_,
+     Jean Kossaifi and `Gilles Louppe`_.
 
 
 Enhancements
@@ -127,11 +130,11 @@ Enhancements
 
    - Allow :func:`datasets.make_multilabel_classification` to output
      a sparse ``y``. By Kashif Rasul.
-     
+
    - :class:`cluster.DBSCAN` now accepts a sparse matrix of precomputed
      distances, allowing memory-efficient distance precomputation. By
      `Joel Nothman`_.
-     
+
    - :class:`tree.DecisionTreeClassifier` now exposes an ``apply`` method
      for retrieving the leaf indices samples are predicted as. By
      `Daniel Galvez`_ and `Gilles Louppe`_.
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 2e988d5985cf8..117327dbc814a 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -403,9 +403,6 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None):
         super(LabelKFold, self).__init__(len(labels), n_folds, shuffle,
                                          random_state)
 
-        self.n = len(labels)
-        self.n_folds = n_folds
-
         unique_labels, labels = np.unique(labels, return_inverse=True)
         n_labels = len(unique_labels)