From 53063ba667750544f9e57d14043dd06a748d1110 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Tue, 24 Mar 2015 15:32:29 +0000
Subject: [PATCH 01/25] Added subject independent KFold

---
 sklearn/cross_validation.py            | 65 ++++++++++++++++++++++++++
 sklearn/tests/test_cross_validation.py | 56 ++++++++++++++++++++++
 2 files changed, 121 insertions(+)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index dacf4c4c67f63..7c37148dbba30 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -332,6 +332,71 @@ def __len__(self):
         return self.n_folds
 
 
+def subject_independent_folds(subjects, n_folds=3):
+    """ Creates folds where a same subject is not in two different folds
+    
+    Parameters
+    ----------
+    subjects: iterable of shape (n_samples, )
+        contains an id for each sample
+        The folds are built so that the same id doesn't appear in two different folds
+    
+    n_folds: int, default is 3
+        number of folds to split the data into
+        
+    Returns
+    -------
+    folds: numpy array of shape (n_samples, )
+        array of integers between 0 and (n_folds - 1)
+        folds[i] contains the folds to which sample i is assigned.
+        
+    Notes
+    -----
+    The folds are built by distributing the subjects by frequency of appearance.
+    """
+    subjects = np.array(subjects)
+    unique_subjects = np.unique(subjects)
+    
+    # number of occurrence of each subject (its "weight")
+    weight_per_subject = sorted([(sum(subjects == i), i) for i in unique_subjects])
+    # Total weight of each fold
+    weight_per_fold = np.zeros(n_folds)
+    # For each sample, a digit between 0 and (n_folds - 1) to tell which fold it belongs to
+    folds = np.zeros(len(subjects))
+    
+    # While there are weights, distribute them
+    # Specifically, add the biggest weight to the lightest fold
+    while weight_per_subject:
+        ind_min = np.argmin(weight_per_fold)
+        w, actor = weight_per_subject.pop()
+        weight_per_fold[ind_min] += w
+        folds[subjects == actor] = ind_min
+    
+    return folds
+
+
+class SubjectIndependentKfold(_BaseKFold):
+    def __init__(self, subjects, n_folds=3):
+        self.n_folds = n_folds
+        self.n = len(subjects)
+        self.idxs = subject_independent_folds(subjects=subjects, n_folds=n_folds)
+
+    def _iter_test_indices(self):
+        for i in range(self.n_folds):
+            yield (self.idxs == i)
+
+    def __repr__(self):
+        return '{}.{}(n_subjects={}, n_folds={})'.format(
+            self.__class__.__module__,
+            self.__class__.__name__,
+            self.n,
+            self.n_folds,
+        )
+
+    def __len__(self):
+        return self.n_folds
+
+
 class StratifiedKFold(_BaseKFold):
     """Stratified K-Folds cross validation iterator
 
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 786bf561ec5e2..a1bf5a4293fd0 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -1038,3 +1038,59 @@ def test_check_is_partition():
 
     p[0] = 23
     assert_false(cval._check_is_partition(p, 100))
+
+
+def test_subject_independent_folds():
+    """ Check that the function produces equilibrated folds
+        with no subject appearing in two different folds
+    """
+    # Fix the seed for reproducibility 
+    np.random.seed(0)
+    
+    # Parameters of the test
+    n_subjects = 15
+    n_samples = 1000
+    n_folds = 5
+    
+    # Construct the test data
+    tolerance = 0.05 * n_samples # 5 percent error allowed
+    subjects = np.random.randint(0, n_subjects, n_samples)
+    folds = cval.subject_independent_folds(subjects, n_folds)
+    ideal_n_subjects_per_fold = n_samples // n_folds
+    
+    # Check that folds have approximately the same size
+    assert(len(folds)==len(subjects))
+    for i in np.unique(folds):
+        assert(abs(sum(folds == i) - ideal_n_subjects_per_fold) <= tolerance)
+    
+    # Check that each subjects appears only in 1 fold
+    for subject in np.unique(subjects):
+        assert(len(np.unique(folds[subjects == subject])) == 1)
+        
+    subjects = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
+                'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
+                'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
+                'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
+                'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
+                'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
+                'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
+    
+    n_subjects = len(np.unique(subjects))
+    n_samples = len(subjects)
+    n_folds = 5
+    
+    # Construct the test data
+    tolerance = 0.05 * n_samples # 5 percent error allowed
+    subjects = np.random.randint(0, n_subjects, n_samples)
+    folds = cval.subject_independent_folds(subjects, n_folds)
+    ideal_n_subjects_per_fold = n_samples // n_folds
+    
+    # Check that folds have approximately the same size
+    assert(len(folds)==len(subjects))
+    for i in np.unique(folds):
+        assert(abs(sum(folds == i) - ideal_n_subjects_per_fold) <= tolerance)
+    
+    # Check that each subjects appears only in 1 fold
+    for subject in np.unique(subjects):
+        assert(len(np.unique(folds[subjects == subject])) == 1)
+        

From 37ecdd7776f9293c19f233e23e0f2ddffab1d9e6 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Wed, 25 Mar 2015 11:34:33 +0000
Subject: [PATCH 02/25] Changed SubjectIndependentKFold to DisjointGroupKFold

Cosmetic changes to the test (fix seed correctly, use assert_equal for
meaningful error messages)
---
 sklearn/cross_validation.py            | 44 ++++++++++++++++----------
 sklearn/tests/test_cross_validation.py | 43 ++++++++++++-------------
 2 files changed, 49 insertions(+), 38 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 7c37148dbba30..47bff1fe8bf0b 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -332,12 +332,12 @@ def __len__(self):
         return self.n_folds
 
 
-def subject_independent_folds(subjects, n_folds=3):
-    """ Creates folds where a same subject is not in two different folds
+def disjoint_group_folds(groups, n_folds=3):
+    """Creates folds where a same group is not in two different folds
     
     Parameters
     ----------
-    subjects: iterable of shape (n_samples, )
+    groups: iterable of shape (n_samples, )
         contains an id for each sample
         The folds are built so that the same id doesn't appear in two different folds
     
@@ -352,34 +352,46 @@ def subject_independent_folds(subjects, n_folds=3):
         
     Notes
     -----
-    The folds are built by distributing the subjects by frequency of appearance.
+    The folds are built by distributing the groups by frequency of appearance.
     """
-    subjects = np.array(subjects)
-    unique_subjects = np.unique(subjects)
+    groups = np.array(groups)
+    unique_groups = np.unique(groups)
     
-    # number of occurrence of each subject (its "weight")
-    weight_per_subject = sorted([(sum(subjects == i), i) for i in unique_subjects])
+    # number of occurrence of each group (its "weight")
+    weight_per_group = sorted([(sum(groups == group_id), group_id) for group_id in unique_groups])
     # Total weight of each fold
     weight_per_fold = np.zeros(n_folds)
     # For each sample, a digit between 0 and (n_folds - 1) to tell which fold it belongs to
-    folds = np.zeros(len(subjects))
+    folds = np.zeros(len(groups))
     
     # While there are weights, distribute them
     # Specifically, add the biggest weight to the lightest fold
-    while weight_per_subject:
+    while weight_per_group:
         ind_min = np.argmin(weight_per_fold)
-        w, actor = weight_per_subject.pop()
+        w, group_id = weight_per_group.pop()
         weight_per_fold[ind_min] += w
-        folds[subjects == actor] = ind_min
+        folds[groups == group_id] = ind_min
     
     return folds
 
 
-class SubjectIndependentKfold(_BaseKFold):
-    def __init__(self, subjects, n_folds=3):
+class DisjointGroupKfold(_BaseKFold):
+    def __init__(self, groups, n_folds=3):
+        """Creates K approximately equilibrated folds
+            where the same group will not appear in two different folds
+
+        Parameters
+        ----------
+        groups: numpy array of shape (n_samples, )
+            contains an id for each sample
+            The folds are built so that the same id doesn't appear in two different folds
+
+        n_folds: int, default is 3
+            number of folds
+        """
         self.n_folds = n_folds
-        self.n = len(subjects)
-        self.idxs = subject_independent_folds(subjects=subjects, n_folds=n_folds)
+        self.n = len(groups)
+        self.idxs = disjoint_group_folds(groups=groups, n_folds=n_folds)
 
     def _iter_test_indices(self):
         for i in range(self.n_folds):
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index a1bf5a4293fd0..8bfb713e4b0e9 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -12,6 +12,7 @@
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_greater
+from sklearn.utils.testing import assert_greater_equal
 from sklearn.utils.testing import assert_less
 from sklearn.utils.testing import assert_not_equal
 from sklearn.utils.testing import assert_array_almost_equal
@@ -1042,32 +1043,33 @@ def test_check_is_partition():
 
 def test_subject_independent_folds():
     """ Check that the function produces equilibrated folds
-        with no subject appearing in two different folds
+        with no group appearing in two different folds
     """
     # Fix the seed for reproducibility 
-    np.random.seed(0)
+    rng = np.random.RandomState(0)
     
     # Parameters of the test
-    n_subjects = 15
+    n_groups = 15
     n_samples = 1000
     n_folds = 5
     
     # Construct the test data
     tolerance = 0.05 * n_samples # 5 percent error allowed
-    subjects = np.random.randint(0, n_subjects, n_samples)
-    folds = cval.subject_independent_folds(subjects, n_folds)
-    ideal_n_subjects_per_fold = n_samples // n_folds
+    groups = np.random.randint(0, n_groups, n_samples)
+    folds = cval.disjoint_group_folds(groups, n_folds)
+    ideal_n_groups_per_fold = n_samples // n_folds
     
     # Check that folds have approximately the same size
-    assert(len(folds)==len(subjects))
+    assert_equal(len(folds), len(groups))
     for i in np.unique(folds):
-        assert(abs(sum(folds == i) - ideal_n_subjects_per_fold) <= tolerance)
+        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold))
     
     # Check that each subjects appears only in 1 fold
-    for subject in np.unique(subjects):
-        assert(len(np.unique(folds[subjects == subject])) == 1)
+    for group in np.unique(groups):
+        assert_equal(len(np.unique(folds[groups == group])), 1)
         
-    subjects = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
+    # Construct the test data
+    groups = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
@@ -1075,22 +1077,19 @@ def test_subject_independent_folds():
                 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
                 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
     
-    n_subjects = len(np.unique(subjects))
-    n_samples = len(subjects)
+    n_groups = len(np.unique(groups))
+    n_samples = len(groups)
     n_folds = 5
-    
-    # Construct the test data
     tolerance = 0.05 * n_samples # 5 percent error allowed
-    subjects = np.random.randint(0, n_subjects, n_samples)
-    folds = cval.subject_independent_folds(subjects, n_folds)
-    ideal_n_subjects_per_fold = n_samples // n_folds
+    folds = cval.disjoint_group_folds(groups, n_folds)
+    ideal_n_groups_per_fold = n_samples // n_folds
     
     # Check that folds have approximately the same size
-    assert(len(folds)==len(subjects))
+    assert_equal(len(folds), len(groups))
     for i in np.unique(folds):
-        assert(abs(sum(folds == i) - ideal_n_subjects_per_fold) <= tolerance)
+        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold))
     
     # Check that each subjects appears only in 1 fold
-    for subject in np.unique(subjects):
-        assert(len(np.unique(folds[subjects == subject])) == 1)
+    for group in np.unique(groups):
+        assert_equal(len(np.unique(folds[groups == group])), 1)
         

From 79cea0c32196a7a59eda8a6241904b7458f5ffe4 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 26 Mar 2015 11:15:11 +0000
Subject: [PATCH 03/25] Changed name to DisjointLabelKFold

---
 sklearn/cross_validation.py            | 38 +++++++++++++-------------
 sklearn/tests/test_cross_validation.py | 38 +++++++++++++-------------
 2 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 47bff1fe8bf0b..9d6ceb9e4dda6 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -332,12 +332,12 @@ def __len__(self):
         return self.n_folds
 
 
-def disjoint_group_folds(groups, n_folds=3):
-    """Creates folds where a same group is not in two different folds
+def disjoint_label_folds(y, n_folds=3):
+    """Creates folds where a same label is not in two different folds
     
     Parameters
     ----------
-    groups: iterable of shape (n_samples, )
+    y: iterable of shape (n_samples, )
         contains an id for each sample
         The folds are built so that the same id doesn't appear in two different folds
     
@@ -352,46 +352,46 @@ def disjoint_group_folds(groups, n_folds=3):
         
     Notes
     -----
-    The folds are built by distributing the groups by frequency of appearance.
+    The folds are built by distributing the labels by frequency of appearance.
     """
-    groups = np.array(groups)
-    unique_groups = np.unique(groups)
+    labels = np.array(y)
+    unique_labels = np.unique(labels)
     
-    # number of occurrence of each group (its "weight")
-    weight_per_group = sorted([(sum(groups == group_id), group_id) for group_id in unique_groups])
+    # number of occurrence of each label (its "weight")
+    weight_per_label = sorted([(sum(labels == label), label) for label in unique_labels])
     # Total weight of each fold
     weight_per_fold = np.zeros(n_folds)
     # For each sample, a digit between 0 and (n_folds - 1) to tell which fold it belongs to
-    folds = np.zeros(len(groups))
+    folds = np.zeros(len(labels))
     
     # While there are weights, distribute them
     # Specifically, add the biggest weight to the lightest fold
-    while weight_per_group:
+    while weight_per_label:
         ind_min = np.argmin(weight_per_fold)
-        w, group_id = weight_per_group.pop()
+        w, label = weight_per_label.pop()
         weight_per_fold[ind_min] += w
-        folds[groups == group_id] = ind_min
+        folds[labels == label] = ind_min
     
     return folds
 
 
-class DisjointGroupKfold(_BaseKFold):
-    def __init__(self, groups, n_folds=3):
+class DisjointLabelKfold(_BaseKFold):
+    def __init__(self, y, n_folds=3):
         """Creates K approximately equilibrated folds
-            where the same group will not appear in two different folds
+            where the same label will not appear in two different folds
 
         Parameters
         ----------
-        groups: numpy array of shape (n_samples, )
+        y: numpy array of shape (n_samples, )
             contains an id for each sample
-            The folds are built so that the same id doesn't appear in two different folds
+            The folds are built so that the same label doesn't appear in two different folds
 
         n_folds: int, default is 3
             number of folds
         """
         self.n_folds = n_folds
-        self.n = len(groups)
-        self.idxs = disjoint_group_folds(groups=groups, n_folds=n_folds)
+        self.n = len(y)
+        self.idxs = disjoint_label_folds(y=y, n_folds=n_folds)
 
     def _iter_test_indices(self):
         for i in range(self.n_folds):
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 8bfb713e4b0e9..6db640c1925e4 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -1041,35 +1041,35 @@ def test_check_is_partition():
     assert_false(cval._check_is_partition(p, 100))
 
 
-def test_subject_independent_folds():
+def test_disjoint_label_folds():
     """ Check that the function produces equilibrated folds
-        with no group appearing in two different folds
+        with no label appearing in two different folds
     """
     # Fix the seed for reproducibility 
     rng = np.random.RandomState(0)
     
     # Parameters of the test
-    n_groups = 15
+    n_labels = 15
     n_samples = 1000
     n_folds = 5
     
     # Construct the test data
     tolerance = 0.05 * n_samples # 5 percent error allowed
-    groups = np.random.randint(0, n_groups, n_samples)
-    folds = cval.disjoint_group_folds(groups, n_folds)
-    ideal_n_groups_per_fold = n_samples // n_folds
+    labels = np.random.randint(0, n_labels, n_samples)
+    folds = cval.disjoint_label_folds(labels, n_folds)
+    ideal_n_labels_per_fold = n_samples // n_folds
     
     # Check that folds have approximately the same size
-    assert_equal(len(folds), len(groups))
+    assert_equal(len(folds), len(labels))
     for i in np.unique(folds):
-        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold))
+        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold))
     
     # Check that each subjects appears only in 1 fold
-    for group in np.unique(groups):
-        assert_equal(len(np.unique(folds[groups == group])), 1)
+    for label in np.unique(labels):
+        assert_equal(len(np.unique(folds[labels == label])), 1)
         
     # Construct the test data
-    groups = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
+    labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
@@ -1077,19 +1077,19 @@ def test_subject_independent_folds():
                 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
                 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
     
-    n_groups = len(np.unique(groups))
-    n_samples = len(groups)
+    n_labels = len(np.unique(labels))
+    n_samples = len(labels)
     n_folds = 5
     tolerance = 0.05 * n_samples # 5 percent error allowed
-    folds = cval.disjoint_group_folds(groups, n_folds)
-    ideal_n_groups_per_fold = n_samples // n_folds
+    folds = cval.disjoint_label_folds(labels, n_folds)
+    ideal_n_labels_per_fold = n_samples // n_folds
     
     # Check that folds have approximately the same size
-    assert_equal(len(folds), len(groups))
+    assert_equal(len(folds), len(labels))
     for i in np.unique(folds):
-        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold))
+        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold))
     
     # Check that each subjects appears only in 1 fold
-    for group in np.unique(groups):
-        assert_equal(len(np.unique(folds[groups == group])), 1)
+    for label in np.unique(labels):
+        assert_equal(len(np.unique(folds[labels == label])), 1)
         

From de5d272eaf71b96ca98f7c4b84a8488db2b8b268 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 26 Mar 2015 12:31:41 +0000
Subject: [PATCH 04/25] Added example of use

---
 sklearn/cross_validation.py | 40 ++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 9d6ceb9e4dda6..d2e26c5bbbf17 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -382,12 +382,38 @@ def __init__(self, y, n_folds=3):
 
         Parameters
         ----------
-        y: numpy array of shape (n_samples, )
-            contains an id for each sample
-            The folds are built so that the same label doesn't appear in two different folds
-
-        n_folds: int, default is 3
-            number of folds
+        y : array-like with shape (n_samples, )
+            Contains a label for each sample.
+            The folds are built so that the same label doesn't appear in two different folds.
+
+        n_folds : int, default is 3
+            Number of folds.
+
+        Examples
+        --------
+        >>> from sklearn import cross_validation
+        >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
+        >>> y = np.array([1, 2, 3, 4])
+        >>> labels = np.array([0, 0, 2, 2])
+        >>> dl_kfold = cross_validation.DisjointLabelKfold(labels, n_folds=2)
+        >>> len(dl_kfold)
+        2
+        >>> print(dl_kfold)
+        sklearn.cross_validation.DisjointLabelKfold(n_labels=4, n_folds=2)
+        >>> for train_index, test_index in dl_kfold:
+        ...     print("TRAIN:", train_index, "TEST:", test_index)
+        ...     X_train, X_test = X[train_index], X[test_index]
+        ...     y_train, y_test = y[train_index], y[test_index]
+        ...     print(X_train, X_test, y_train, y_test)
+        ... 
+        TRAIN: [0 1] TEST: [2 3]
+        [[1 2]
+        [3 4]] [[5 6]
+        [7 8]] [1 2] [3 4]
+        TRAIN: [2 3] TEST: [0 1]
+        [[5 6]
+        [7 8]] [[1 2]
+        [3 4]] [3 4] [1 2]
         """
         self.n_folds = n_folds
         self.n = len(y)
@@ -398,7 +424,7 @@ def _iter_test_indices(self):
             yield (self.idxs == i)
 
     def __repr__(self):
-        return '{}.{}(n_subjects={}, n_folds={})'.format(
+        return '{}.{}(n_labels={}, n_folds={})'.format(
             self.__class__.__module__,
             self.__class__.__name__,
             self.n,

From a8098819f5983efc21767e74fac2509f2ae74726 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 26 Mar 2015 12:56:33 +0000
Subject: [PATCH 05/25] FIX: whitespace related doctest failure

---
 sklearn/cross_validation.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index d2e26c5bbbf17..db7071dd05276 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -408,12 +408,12 @@ def __init__(self, y, n_folds=3):
         ... 
         TRAIN: [0 1] TEST: [2 3]
         [[1 2]
-        [3 4]] [[5 6]
-        [7 8]] [1 2] [3 4]
+         [3 4]] [[5 6]
+         [7 8]] [1 2] [3 4]
         TRAIN: [2 3] TEST: [0 1]
         [[5 6]
-        [7 8]] [[1 2]
-        [3 4]] [3 4] [1 2]
+         [7 8]] [[1 2]
+         [3 4]] [3 4] [1 2]
         """
         self.n_folds = n_folds
         self.n = len(y)

From c14f84784031085b0ba912174c3fd41b56263367 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 26 Mar 2015 13:24:01 +0000
Subject: [PATCH 06/25] FIX: Python 2.6 requires field numbers in str.format

---
 sklearn/cross_validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index db7071dd05276..53420c97157cf 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -424,7 +424,7 @@ def _iter_test_indices(self):
             yield (self.idxs == i)
 
     def __repr__(self):
-        return '{}.{}(n_labels={}, n_folds={})'.format(
+        return '{0}.{1}(n_labels={2}, n_folds={3})'.format(
             self.__class__.__module__,
             self.__class__.__name__,
             self.n,

From b0fc20422f26e1134e8b618ae047ba22444acf57 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 26 Mar 2015 13:45:28 +0000
Subject: [PATCH 07/25] FIX: change docstring to comment in test function

---
 sklearn/tests/test_cross_validation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 6db640c1925e4..25b617c93d3ae 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -1042,9 +1042,9 @@ def test_check_is_partition():
 
 
 def test_disjoint_label_folds():
-    """ Check that the function produces equilibrated folds
-        with no label appearing in two different folds
-    """
+    ## Check that the function produces equilibrated folds
+    ##      with no label appearing in two different folds
+
     # Fix the seed for reproducibility 
     rng = np.random.RandomState(0)
     

From 1916b26425772f177b155c012c0d46ade2ddc8b3 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 14 May 2015 12:16:34 +0100
Subject: [PATCH 08/25] DOC: moved docstring from function to class

---
 sklearn/cross_validation.py | 77 +++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 38 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 53420c97157cf..965596ce74475 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -376,45 +376,46 @@ def disjoint_label_folds(y, n_folds=3):
 
 
 class DisjointLabelKfold(_BaseKFold):
+    """Creates K approximately equilibrated folds
+        where the same label will not appear in two different folds
+
+    Parameters
+    ----------
+    y : array-like with shape (n_samples, )
+        Contains a label for each sample.
+        The folds are built so that the same label doesn't appear in two different folds.
+
+    n_folds : int, default is 3
+        Number of folds.
+
+    Examples
+    --------
+    >>> from sklearn import cross_validation
+    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
+    >>> y = np.array([1, 2, 3, 4])
+    >>> labels = np.array([0, 0, 2, 2])
+    >>> dl_kfold = cross_validation.DisjointLabelKfold(labels, n_folds=2)
+    >>> len(dl_kfold)
+    2
+    >>> print(dl_kfold)
+    sklearn.cross_validation.DisjointLabelKfold(n_labels=4, n_folds=2)
+    >>> for train_index, test_index in dl_kfold:
+    ...     print("TRAIN:", train_index, "TEST:", test_index)
+    ...     X_train, X_test = X[train_index], X[test_index]
+    ...     y_train, y_test = y[train_index], y[test_index]
+    ...     print(X_train, X_test, y_train, y_test)
+    ... 
+    TRAIN: [0 1] TEST: [2 3]
+    [[1 2]
+        [3 4]] [[5 6]
+        [7 8]] [1 2] [3 4]
+    TRAIN: [2 3] TEST: [0 1]
+    [[5 6]
+        [7 8]] [[1 2]
+        [3 4]] [3 4] [1 2]
+    """
+
     def __init__(self, y, n_folds=3):
-        """Creates K approximately equilibrated folds
-            where the same label will not appear in two different folds
-
-        Parameters
-        ----------
-        y : array-like with shape (n_samples, )
-            Contains a label for each sample.
-            The folds are built so that the same label doesn't appear in two different folds.
-
-        n_folds : int, default is 3
-            Number of folds.
-
-        Examples
-        --------
-        >>> from sklearn import cross_validation
-        >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
-        >>> y = np.array([1, 2, 3, 4])
-        >>> labels = np.array([0, 0, 2, 2])
-        >>> dl_kfold = cross_validation.DisjointLabelKfold(labels, n_folds=2)
-        >>> len(dl_kfold)
-        2
-        >>> print(dl_kfold)
-        sklearn.cross_validation.DisjointLabelKfold(n_labels=4, n_folds=2)
-        >>> for train_index, test_index in dl_kfold:
-        ...     print("TRAIN:", train_index, "TEST:", test_index)
-        ...     X_train, X_test = X[train_index], X[test_index]
-        ...     y_train, y_test = y[train_index], y[test_index]
-        ...     print(X_train, X_test, y_train, y_test)
-        ... 
-        TRAIN: [0 1] TEST: [2 3]
-        [[1 2]
-         [3 4]] [[5 6]
-         [7 8]] [1 2] [3 4]
-        TRAIN: [2 3] TEST: [0 1]
-        [[5 6]
-         [7 8]] [[1 2]
-         [3 4]] [3 4] [1 2]
-        """
         self.n_folds = n_folds
         self.n = len(y)
         self.idxs = disjoint_label_folds(y=y, n_folds=n_folds)

From f320bc1a384221e359803a0f84319a1979712344 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 14 May 2015 12:29:42 +0100
Subject: [PATCH 09/25] FIX: added call to parent class

---
 sklearn/cross_validation.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 965596ce74475..a56271ef0b160 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -416,6 +416,8 @@ class DisjointLabelKfold(_BaseKFold):
     """
 
     def __init__(self, y, n_folds=3):
+        # No shuffling implemented yet
+        super(KFold, self).__init__(n, n_folds, False, None)
         self.n_folds = n_folds
         self.n = len(y)
         self.idxs = disjoint_label_folds(y=y, n_folds=n_folds)

From f58dbe663b9b4e250719d98bb5394685dfd20630 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 14 May 2015 12:44:28 +0100
Subject: [PATCH 10/25] FIX: error in calling the parent

---
 sklearn/cross_validation.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index a56271ef0b160..42bcdb97ef860 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -406,18 +406,18 @@ class DisjointLabelKfold(_BaseKFold):
     ...     print(X_train, X_test, y_train, y_test)
     ... 
     TRAIN: [0 1] TEST: [2 3]
-    [[1 2]
-        [3 4]] [[5 6]
-        [7 8]] [1 2] [3 4]
+     [[1 2]
+     [3 4]] [[5 6]
+     [7 8]] [1 2] [3 4]
     TRAIN: [2 3] TEST: [0 1]
-    [[5 6]
-        [7 8]] [[1 2]
-        [3 4]] [3 4] [1 2]
+     [[5 6]
+     [7 8]] [[1 2]
+     [3 4]] [3 4] [1 2]
     """
 
     def __init__(self, y, n_folds=3):
         # No shuffling implemented yet
-        super(KFold, self).__init__(n, n_folds, False, None)
+        super(DisjointLabelKfold, self).__init__(len(y), n_folds, False, None)
         self.n_folds = n_folds
         self.n = len(y)
         self.idxs = disjoint_label_folds(y=y, n_folds=n_folds)

From 89358a3670f444e7810e9c7627a90f7ba7d36b6a Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 14 May 2015 13:30:24 +0100
Subject: [PATCH 11/25] DOC: fixed doctest

---
 sklearn/cross_validation.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 42bcdb97ef860..b90c2dcd9618c 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -406,13 +406,13 @@ class DisjointLabelKfold(_BaseKFold):
     ...     print(X_train, X_test, y_train, y_test)
     ... 
     TRAIN: [0 1] TEST: [2 3]
-     [[1 2]
-     [3 4]] [[5 6]
-     [7 8]] [1 2] [3 4]
+    [[1 2]
+    [3 4]] [[5 6]
+    [7 8]] [1 2] [3 4]
     TRAIN: [2 3] TEST: [0 1]
-     [[5 6]
-     [7 8]] [[1 2]
-     [3 4]] [3 4] [1 2]
+    [[5 6]
+    [7 8]] [[1 2]
+    [3 4]] [3 4] [1 2]
     """
 
     def __init__(self, y, n_folds=3):

From 90cfc566feff55de32bcf060ef7d644edf845649 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 14 May 2015 14:33:42 +0100
Subject: [PATCH 12/25] FIX: doctest

---
 sklearn/cross_validation.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index b90c2dcd9618c..5b5a7e7260e65 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -407,12 +407,12 @@ class DisjointLabelKfold(_BaseKFold):
     ... 
     TRAIN: [0 1] TEST: [2 3]
     [[1 2]
-    [3 4]] [[5 6]
-    [7 8]] [1 2] [3 4]
+     [3 4]] [[5 6]
+     [7 8]] [1 2] [3 4]
     TRAIN: [2 3] TEST: [0 1]
     [[5 6]
-    [7 8]] [[1 2]
-    [3 4]] [3 4] [1 2]
+     [7 8]] [[1 2]
+     [3 4]] [3 4] [1 2]
     """
 
     def __init__(self, y, n_folds=3):

From 5630d34512665d3260f9dc5e3131dfa25f508f27 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Tue, 23 Jun 2015 16:35:47 +0100
Subject: [PATCH 13/25] Cosmetic changes (minor refactoring)

---
 sklearn/cross_validation.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 5b5a7e7260e65..5c26d5ab2bf66 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -333,29 +333,29 @@ def __len__(self):
 
 
 def disjoint_label_folds(y, n_folds=3):
-    """Creates folds where a same label is not in two different folds
+    """Creates folds where a same label is not in two different folds.
     
     Parameters
     ----------
-    y: iterable of shape (n_samples, )
-        contains an id for each sample
-        The folds are built so that the same id doesn't appear in two different folds
+    y : iterable of shape (n_samples,)
+        Contains an id for each sample.
+        The folds are built so that the same id doesn't appear in two different folds.
     
     n_folds: int, default is 3
-        number of folds to split the data into
+        Number of folds to split the data into.
         
     Returns
     -------
     folds: numpy array of shape (n_samples, )
-        array of integers between 0 and (n_folds - 1)
-        folds[i] contains the folds to which sample i is assigned.
+        Array of integers between 0 and (n_folds - 1).
+        folds[i] contains the fold to which sample i is assigned.
         
     Notes
     -----
     The folds are built by distributing the labels by frequency of appearance.
     """
     labels = np.array(y)
-    unique_labels = np.unique(labels)
+    unique_labels, y = np.unique(labels, return_inverse=True)
     
     # number of occurrence of each label (its "weight")
     weight_per_label = sorted([(sum(labels == label), label) for label in unique_labels])
@@ -375,9 +375,10 @@ def disjoint_label_folds(y, n_folds=3):
     return folds
 
 
-class DisjointLabelKfold(_BaseKFold):
-    """Creates K approximately equilibrated folds
-        where the same label will not appear in two different folds
+class DisjointLabelKFold(_BaseKFold):
+    """Creates K approximately equilibrated folds.
+    
+    The same label will not appear in two different folds.
 
     Parameters
     ----------
@@ -394,11 +395,11 @@ class DisjointLabelKfold(_BaseKFold):
     >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
     >>> y = np.array([1, 2, 3, 4])
     >>> labels = np.array([0, 0, 2, 2])
-    >>> dl_kfold = cross_validation.DisjointLabelKfold(labels, n_folds=2)
+    >>> dl_kfold = cross_validation.DisjointLabelKFold(labels, n_folds=2)
     >>> len(dl_kfold)
     2
     >>> print(dl_kfold)
-    sklearn.cross_validation.DisjointLabelKfold(n_labels=4, n_folds=2)
+    sklearn.cross_validation.DisjointLabelKFold(n_labels=4, n_folds=2)
     >>> for train_index, test_index in dl_kfold:
     ...     print("TRAIN:", train_index, "TEST:", test_index)
     ...     X_train, X_test = X[train_index], X[test_index]
@@ -415,9 +416,9 @@ class DisjointLabelKfold(_BaseKFold):
      [3 4]] [3 4] [1 2]
     """
 
-    def __init__(self, y, n_folds=3):
+    def __init__(self, y, n_folds=3, shuffle=False, random_state=None):
         # No shuffling implemented yet
-        super(DisjointLabelKfold, self).__init__(len(y), n_folds, False, None)
+        super(DisjointLabelKFold, self).__init__(len(y), n_folds, shuffle, random_state)
         self.n_folds = n_folds
         self.n = len(y)
         self.idxs = disjoint_label_folds(y=y, n_folds=n_folds)

From 19da58ae24cd3b49b884303ed090d090dd917bce Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Tue, 23 Jun 2015 17:17:53 +0100
Subject: [PATCH 14/25] Optimised code (use np.bincount)

---
 sklearn/cross_validation.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 5c26d5ab2bf66..28fa786d9516e 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -358,20 +358,26 @@ def disjoint_label_folds(y, n_folds=3):
     unique_labels, y = np.unique(labels, return_inverse=True)
     
     # number of occurrence of each label (its "weight")
-    weight_per_label = sorted([(sum(labels == label), label) for label in unique_labels])
+    weight_per_label = np.bincount(y)
+    # We want to distribute the heaviest weights first
+    ind = np.argsort(weight_per_label)[::-1]
+    weight_per_label = weight_per_label[ind]
+
     # Total weight of each fold
     weight_per_fold = np.zeros(n_folds)
-    # For each sample, a digit between 0 and (n_folds - 1) to tell which fold it belongs to
-    folds = np.zeros(len(labels))
+
+    # Mapping from label index to fold index
+    label_to_fold = np.zeros(len(unique_labels))
     
     # While there are weights, distribute them
     # Specifically, add the biggest weight to the lightest fold
-    while weight_per_label:
+    for label_index, w in enumerate(weight_per_label):
         ind_min = np.argmin(weight_per_fold)
-        w, label = weight_per_label.pop()
         weight_per_fold[ind_min] += w
-        folds[labels == label] = ind_min
+        label_to_fold[ind[label_index]] = ind_min
     
+    folds = label_to_fold[y]
+
     return folds
 
 

From 4706289c251f6ec0ff5066fb9eee50d1082fe11e Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Tue, 23 Jun 2015 18:52:34 +0100
Subject: [PATCH 15/25] Cosmetic: use samples instead of weight for clarity

---
 sklearn/cross_validation.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 28fa786d9516e..29ac0a5244382 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -358,22 +358,22 @@ def disjoint_label_folds(y, n_folds=3):
     unique_labels, y = np.unique(labels, return_inverse=True)
     
     # number of occurrence of each label (its "weight")
-    weight_per_label = np.bincount(y)
-    # We want to distribute the heaviest weights first
-    ind = np.argsort(weight_per_label)[::-1]
-    weight_per_label = weight_per_label[ind]
+    samples_per_label = np.bincount(y)
+    # We want to distribute the most frequent labels first
+    ind = np.argsort(samples_per_label)[::-1]
+    samples_per_label = samples_per_label[ind]
 
     # Total weight of each fold
-    weight_per_fold = np.zeros(n_folds)
+    samples_per_fold = np.zeros(n_folds)
 
     # Mapping from label index to fold index
     label_to_fold = np.zeros(len(unique_labels))
     
     # While there are weights, distribute them
     # Specifically, add the biggest weight to the lightest fold
-    for label_index, w in enumerate(weight_per_label):
-        ind_min = np.argmin(weight_per_fold)
-        weight_per_fold[ind_min] += w
+    for label_index, w in enumerate(samples_per_label):
+        ind_min = np.argmin(samples_per_fold)
+        samples_per_fold[ind_min] += w
         label_to_fold[ind[label_index]] = ind_min
     
     folds = label_to_fold[y]

From b3055396769e9ca5b440135c3cc03ce8861a50ba Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Tue, 23 Jun 2015 20:41:07 +0100
Subject: [PATCH 16/25] Minor fix: removed shuffle parameter

---
 sklearn/cross_validation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 29ac0a5244382..d45cf3984c0a6 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -422,9 +422,9 @@ class DisjointLabelKFold(_BaseKFold):
      [3 4]] [3 4] [1 2]
     """
 
-    def __init__(self, y, n_folds=3, shuffle=False, random_state=None):
+    def __init__(self, y, n_folds=3):
         # No shuffling implemented yet
-        super(DisjointLabelKFold, self).__init__(len(y), n_folds, shuffle, random_state)
+        super(DisjointLabelKFold, self).__init__(len(y), n_folds, False, None)
         self.n_folds = n_folds
         self.n = len(y)
         self.idxs = disjoint_label_folds(y=y, n_folds=n_folds)

From b7fc3d884afff3dceb834dc7117c5883f9ba391a Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Tue, 23 Jun 2015 20:55:58 +0100
Subject: [PATCH 17/25] Cosmetic

---
 sklearn/cross_validation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index d45cf3984c0a6..a85ad0fa527cb 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -337,11 +337,11 @@ def disjoint_label_folds(y, n_folds=3):
     
     Parameters
     ----------
-    y : iterable of shape (n_samples,)
+    y: numpy array, shape (n_samples,)
         Contains an id for each sample.
         The folds are built so that the same id doesn't appear in two different folds.
     
-    n_folds: int, default is 3
+    n_folds: int, default=3
         Number of folds to split the data into.
         
     Returns

From 65d96b3fd6d00074a142736c1c71f0af5fbaf9b8 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 25 Jun 2015 10:47:00 +0100
Subject: [PATCH 18/25] Use mergesort instead of quicksort for reproducibility.

---
 sklearn/cross_validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index a85ad0fa527cb..ec7ed22655b41 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -360,7 +360,7 @@ def disjoint_label_folds(y, n_folds=3):
     # number of occurrence of each label (its "weight")
     samples_per_label = np.bincount(y)
     # We want to distribute the most frequent labels first
-    ind = np.argsort(samples_per_label)[::-1]
+    ind = np.argsort(samples_per_label, kind='mergesort')[::-1]
     samples_per_label = samples_per_label[ind]
 
     # Total weight of each fold

From fa03f1b1b2dcb1ac92a32bea5621a0863500351b Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Thu, 25 Jun 2015 12:01:32 +0100
Subject: [PATCH 19/25] Changed variable name 'y' to 'label'.

---
 sklearn/cross_validation.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index ec7ed22655b41..c3639f4e99969 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -332,12 +332,12 @@ def __len__(self):
         return self.n_folds
 
 
-def disjoint_label_folds(y, n_folds=3):
+def disjoint_label_folds(labels, n_folds=3):
     """Creates folds where a same label is not in two different folds.
     
     Parameters
     ----------
-    y: numpy array, shape (n_samples,)
+    labels: numpy array, shape (n_samples,)
         Contains an id for each sample.
         The folds are built so that the same id doesn't appear in two different folds.
     
@@ -354,11 +354,11 @@ def disjoint_label_folds(y, n_folds=3):
     -----
     The folds are built by distributing the labels by frequency of appearance.
     """
-    labels = np.array(y)
-    unique_labels, y = np.unique(labels, return_inverse=True)
+    labels = np.array(labels)
+    unique_labels, labels = np.unique(labels, return_inverse=True)
     
     # number of occurrence of each label (its "weight")
-    samples_per_label = np.bincount(y)
+    samples_per_label = np.bincount(labels)
     # We want to distribute the most frequent labels first
     ind = np.argsort(samples_per_label, kind='mergesort')[::-1]
     samples_per_label = samples_per_label[ind]
@@ -376,7 +376,7 @@ def disjoint_label_folds(y, n_folds=3):
         samples_per_fold[ind_min] += w
         label_to_fold[ind[label_index]] = ind_min
     
-    folds = label_to_fold[y]
+    folds = label_to_fold[labels]
 
     return folds
 
@@ -388,7 +388,7 @@ class DisjointLabelKFold(_BaseKFold):
 
     Parameters
     ----------
-    y : array-like with shape (n_samples, )
+    labels : array-like with shape (n_samples, )
         Contains a label for each sample.
         The folds are built so that the same label doesn't appear in two different folds.
 
@@ -422,12 +422,12 @@ class DisjointLabelKFold(_BaseKFold):
      [3 4]] [3 4] [1 2]
     """
 
-    def __init__(self, y, n_folds=3):
+    def __init__(self, labels, n_folds=3):
         # No shuffling implemented yet
-        super(DisjointLabelKFold, self).__init__(len(y), n_folds, False, None)
+        super(DisjointLabelKFold, self).__init__(len(labels), n_folds, False, None)
         self.n_folds = n_folds
-        self.n = len(y)
-        self.idxs = disjoint_label_folds(y=y, n_folds=n_folds)
+        self.n = len(labels)
+        self.idxs = disjoint_label_folds(labels=labels, n_folds=n_folds)
 
     def _iter_test_indices(self):
         for i in range(self.n_folds):

From 3faa60a248d3b099c18d73b1a7e622dfa5293f47 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Mon, 29 Jun 2015 15:36:12 +0100
Subject: [PATCH 20/25] Added test for degenerate case where n_folds >
 n_labels.

---
 sklearn/cross_validation.py            | 5 +++++
 sklearn/tests/test_cross_validation.py | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index c3639f4e99969..ce5731f3e1f06 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -356,6 +356,11 @@ def disjoint_label_folds(labels, n_folds=3):
     """
     labels = np.array(labels)
     unique_labels, labels = np.unique(labels, return_inverse=True)
+    n_labels = len(unique_labels)
+    if n_folds > n_labels:
+        raise ValueError(
+                ("Cannot have number of folds n_folds={0} greater"
+                 " than the number of labels: {1}.").format(n_folds, n_labels))
     
     # number of occurrence of each label (its "weight")
     samples_per_label = np.bincount(labels)
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 25b617c93d3ae..c01acef37e5a8 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -1093,3 +1093,6 @@ def test_disjoint_label_folds():
     for label in np.unique(labels):
         assert_equal(len(np.unique(folds[labels == label])), 1)
         
+    # Should fail if there are more folds than labels
+    labels = np.array([1, 1, 1, 2, 2])
+    assert_raises(ValueError, cval.disjoint_label_folds, labels, n_folds=3)

From 90275bee75aa2e2fe5347ae805864d4abd96b58f Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Mon, 29 Jun 2015 15:37:54 +0100
Subject: [PATCH 21/25] Documented the requirement n_labels >= n_folds.

---
 sklearn/cross_validation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index ce5731f3e1f06..c456934592a71 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -353,6 +353,7 @@ def disjoint_label_folds(labels, n_folds=3):
     Notes
     -----
     The folds are built by distributing the labels by frequency of appearance.
+    The number of labels has to be at least equal to the number of folds.
     """
     labels = np.array(labels)
     unique_labels, labels = np.unique(labels, return_inverse=True)
@@ -389,7 +390,8 @@ def disjoint_label_folds(labels, n_folds=3):
 class DisjointLabelKFold(_BaseKFold):
     """Creates K approximately equilibrated folds.
     
-    The same label will not appear in two different folds.
+    The same label will not appear in two different folds
+    (the number of labels has to be at least equal to the number of folds).
 
     Parameters
     ----------

From 0b2dff6b23118ef489d080d4d2bf3b19b27c1385 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Wed, 1 Jul 2015 20:09:38 +0100
Subject: [PATCH 22/25] DOC: improved description + added see also sections.

---
 sklearn/cross_validation.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index c456934592a71..44c636ab55ebf 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -297,6 +297,8 @@ class KFold(_BaseKFold):
     StratifiedKFold: take label information into account to avoid building
     folds with imbalanced class distributions (for binary or multiclass
     classification tasks).
+
+    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, n, n_folds=3, shuffle=False,
@@ -388,10 +390,13 @@ def disjoint_label_folds(labels, n_folds=3):
 
 
 class DisjointLabelKFold(_BaseKFold):
-    """Creates K approximately equilibrated folds.
-    
-    The same label will not appear in two different folds
-    (the number of labels has to be at least equal to the number of folds).
+    """K-fold iterator variant with non-overlapping labels.
+
+    The same label will not appear in two different folds (the number of
+    labels has to be at least equal to the number of folds).
+
+    The folds are approximately balanced in the sense that the number of
+    distinct labels is approximately the same in each fold.
 
     Parameters
     ----------
@@ -427,8 +432,12 @@ class DisjointLabelKFold(_BaseKFold):
     [[5 6]
      [7 8]] [[1 2]
      [3 4]] [3 4] [1 2]
-    """
 
+    See also
+    --------
+    LeaveOneLabelOut for splitting the data according to explicit,
+    domain-specific stratification of the dataset.
+    """
     def __init__(self, labels, n_folds=3):
         # No shuffling implemented yet
         super(DisjointLabelKFold, self).__init__(len(labels), n_folds, False, None)
@@ -500,6 +509,9 @@ class StratifiedKFold(_BaseKFold):
     All the folds have size trunc(n_samples / n_folds), the last one has the
     complementary.
 
+    See also
+    --------
+    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, y, n_folds=3, shuffle=False,
@@ -606,6 +618,9 @@ class LeaveOneLabelOut(_PartitionIterator):
      [3 4]] [[5 6]
      [7 8]] [1 2] [1 2]
 
+    See also
+    --------
+    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, labels):
@@ -679,6 +694,10 @@ class LeavePLabelOut(_PartitionIterator):
     TRAIN: [0] TEST: [1 2]
     [[1 2]] [[3 4]
      [5 6]] [1] [2 1]
+
+    See also
+    --------
+    DisjointLabelKFold: K-fold iterator variant with non-overlapping labels.
     """
 
     def __init__(self, labels, p):

From 6b4b63fde6059afbae855d2893117347d39d02d1 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Wed, 1 Jul 2015 20:11:44 +0100
Subject: [PATCH 23/25] Fixed dtype of temporary arrays.

---
 sklearn/cross_validation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 44c636ab55ebf..ed7458be80bb1 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -372,10 +372,10 @@ def disjoint_label_folds(labels, n_folds=3):
     samples_per_label = samples_per_label[ind]
 
     # Total weight of each fold
-    samples_per_fold = np.zeros(n_folds)
+    samples_per_fold = np.zeros(n_folds, dtype=np.uint64)
 
     # Mapping from label index to fold index
-    label_to_fold = np.zeros(len(unique_labels))
+    label_to_fold = np.zeros(len(unique_labels), dtype=np.uintp)
     
     # While there are weights, distribute them
     # Specifically, add the biggest weight to the lightest fold

From c9a636c4d5ab44937305352de907aea5952502c1 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Wed, 1 Jul 2015 20:15:31 +0100
Subject: [PATCH 24/25] Improved test: check that one label is not in both test
 and training.

---
 sklearn/tests/test_cross_validation.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index c01acef37e5a8..b7e4e15ef76eb 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -1055,7 +1055,7 @@ def test_disjoint_label_folds():
     
     # Construct the test data
     tolerance = 0.05 * n_samples # 5 percent error allowed
-    labels = np.random.randint(0, n_labels, n_samples)
+    labels = rng.randint(0, n_labels, n_samples)
     folds = cval.disjoint_label_folds(labels, n_folds)
     ideal_n_labels_per_fold = n_samples // n_folds
     
@@ -1064,9 +1064,14 @@ def test_disjoint_label_folds():
     for i in np.unique(folds):
         assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold))
     
-    # Check that each subjects appears only in 1 fold
+    # Check that each label appears only in 1 fold
     for label in np.unique(labels):
         assert_equal(len(np.unique(folds[labels == label])), 1)
+
+    # Check that no label is on both sides of the split
+    labels = np.asarray(labels, dtype=object)  # to allow fancy indexing on labels
+    for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds):
+        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
         
     # Construct the test data
     labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
@@ -1089,9 +1094,14 @@ def test_disjoint_label_folds():
     for i in np.unique(folds):
         assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold))
     
-    # Check that each subjects appears only in 1 fold
+    # Check that each label appears only in 1 fold
     for label in np.unique(labels):
         assert_equal(len(np.unique(folds[labels == label])), 1)
+
+    # Check that no label is on both sides of the split
+    labels = np.asarray(labels, dtype=object)  # to allow fancy indexing on labels
+    for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds):
+        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
         
     # Should fail if there are more folds than labels
     labels = np.array([1, 1, 1, 2, 2])

From ecb0ea0c30315696c891269688635476e6cd9564 Mon Sep 17 00:00:00 2001
From: Jean Kossaifi <jean.kossaifi@gmail.com>
Date: Wed, 1 Jul 2015 20:27:52 +0100
Subject: [PATCH 25/25] Added documentation for DisjointLabelKFold.

---
 doc/modules/classes.rst          |  1 +
 doc/modules/cross_validation.rst | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 4309c8a21dfc8..6147086d2bb4e 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -168,6 +168,7 @@ Classes
    cross_validation.LeavePOut
    cross_validation.PredefinedSplit
    cross_validation.StratifiedKFold
+   cross_validation.DisjointLabelKFold
    cross_validation.ShuffleSplit
    cross_validation.StratifiedShuffleSplit
 
diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 0aa6bf1e3b692..5d6dd2d041a6f 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -261,6 +261,33 @@ two slightly unbalanced classes::
   [0 1 2 4 5 6 7] [3 8 9]
 
 
+Disjoint label KFold
+--------------------
+
+:class:`DisjointLabelKFold` is a variation of *k-fold* which ensures that the same
+label is not in both testing and training sets.
+This is necessary for example if you obtained data from different subjects and you
+want to avoid over-fitting (i.e. learning person-specific features) by testing and
+training on different subjects.
+
+Imagine you have three subjects, each with an associated number from 1 to 3::
+
+  >>> from sklearn.cross_validation import DisjointLabelKFold
+
+  >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
+
+  >>> dlkf = DisjointLabelKFold(labels, 3)
+  >>> for train, test in dlkf:
+  ...     print("%s %s" % (train, test))
+  [0 1 2 3 4 5] [6 7 8 9]
+  [0 1 2 6 7 8 9] [3 4 5]
+  [3 4 5 6 7 8 9] [0 1 2]
+
+Each subject is in a different testing fold, and the same subject is never in both
+testing and training.
+Notice that the folds do not have exactly the same size due to the imbalance in the data.
+
+
 Leave-One-Out - LOO
 -------------------