From cf47467ee94a3fdb9dbfad11729049a62726e745 Mon Sep 17 00:00:00 2001
From: Christoph Sawade
Date: Tue, 12 May 2015 09:54:40 +0200
Subject: [PATCH] LabelSegmentedKFold cross-validation iterator

This PR implements a variant of KFold that ensures that the train and
test splits are disjoint with respect to a third-party label.
---
 sklearn/cross_validation.py            | 79 ++++++++++++++++++++++++++
 sklearn/tests/test_cross_validation.py | 23 ++++++++
 2 files changed, 102 insertions(+)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index ad7491e6a3eeb..5077ec1ef71d8 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -37,6 +37,7 @@
     'LeaveOneOut',
     'LeavePLabelOut',
     'LeavePOut',
+    'LabelSegmentedKFold',
     'ShuffleSplit',
     'StratifiedKFold',
     'StratifiedShuffleSplit',
@@ -445,6 +446,84 @@ def __len__(self):
         return self.n_folds
 
 
+class LabelSegmentedKFold(_BaseKFold):
+    """Segmented K-Folds cross validation iterator
+
+    Provides train/test indices to split data in train test sets.
+
+    This cross-validation object is a variation of KFold that
+    returns folds where the train and test sets are disjoint with
+    respect to a third-party provided label.
+
+    The difference between LeavePLabelOut and LabelSegmentedKFold is that
+    the former generates splits using all subsets of size `p` unique labels,
+    while the latter splits the data into k consecutive folds, where each
+    third-party label occurs only once in the test set.
+
+    Parameters
+    ----------
+    labels : array-like of int with shape (n_samples,)
+        Arbitrary domain-specific stratification of the data to be used
+        to draw the splits.
+
+    n_folds : int, default=3
+        Number of folds. Must be at least 2.
+
+    shuffle : boolean, default=False
+        Whether to shuffle the order in which the label batches are
+        assigned to the folds.
+
+    random_state : None, int or RandomState, default=None
+        Pseudo-random number generator state used for random
+        sampling. If None, use default numpy RNG for shuffling.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn import cross_validation
+    >>> labels = np.array([1, 2, 3, 1, 2, 3, 4, 4, 4])
+    >>> lkf = cross_validation.LabelSegmentedKFold(labels, n_folds=3)
+    >>> for train_index, test_index in lkf:
+    ...     print("TRAIN:", labels[train_index], "TEST:", labels[test_index])
+    TRAIN: [2 3 2 3] TEST: [1 1 4 4 4]
+    TRAIN: [1 3 1 3 4 4 4] TEST: [2 2]
+    TRAIN: [1 2 1 2 4 4 4] TEST: [3 3]
+    """
+    def __init__(self, labels, n_folds=3, shuffle=False, random_state=None):
+        self.labels = np.array(labels, copy=True)
+        self.unique_labels = np.unique(labels)
+        super(LabelSegmentedKFold, self).__init__(
+            self.unique_labels.size, n_folds, shuffle, random_state)
+
+        self.n = self.labels.size
+        self.shuffle = shuffle
+        self.n_folds = n_folds
+
+        # Batch i receives every n_folds-th unique label, starting at i.
+        self.batches = [self.unique_labels[i::self.n_folds]
+                        for i in range(self.n_folds)]
+        if shuffle:
+            rng = check_random_state(self.random_state)
+            rng.shuffle(self.batches)
+
+    def _iter_test_masks(self):
+        for batch in self.batches:
+            yield np.array([lbl in batch for lbl in self.labels])
+
+    def __repr__(self):
+        return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % (
+            self.__class__.__module__,
+            self.__class__.__name__,
+            self.labels,
+            self.n_folds,
+            self.shuffle,
+            self.random_state,
+        )
+
+    def __len__(self):
+        return self.n_folds
+
+
 class LeaveOneLabelOut(_PartitionIterator):
     """Leave-One-Label_Out cross-validation iterator
 
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 17ecde996d26d..97ab8a17de69a 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -356,6 +356,29 @@ def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
     assert_greater(mean_score, 0.85)
 
 
+def test_label_segmented_kfold():
+    # Every unique label must land in exactly one test fold.
+    labels = np.array([1, 2, 3, 1, 2, 3, 4, 4, 4])
+    expected_train = [
+        np.array([1, 2, 4, 5]),           # labels: 2, 3
+        np.array([0, 2, 3, 5, 6, 7, 8]),  # labels: 1, 3, 4
+        np.array([0, 1, 3, 4, 6, 7, 8])   # labels: 1, 2, 4
+    ]
+    expected_test = [
+        np.array([0, 3, 6, 7, 8]),        # labels: 1, 4
+        np.array([1, 4]),                 # labels: 2
+        np.array([2, 5])                  # labels: 3
+    ]
+    lkf = cval.LabelSegmentedKFold(labels, n_folds=3, shuffle=False)
+    assert_equal(len(lkf), 3)
+    for i, (train, test) in enumerate(lkf):
+        assert_array_equal(expected_train[i], train)
+        assert_array_equal(expected_test[i], test)
+
+    # Smoke test: __repr__ must not raise.
+    repr(lkf)
+
+
 def test_shuffle_split():
     ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
     ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)