From cf47467ee94a3fdb9dbfad11729049a62726e745 Mon Sep 17 00:00:00 2001
From: Christoph Sawade
Date: Tue, 12 May 2015 09:54:40 +0200
Subject: [PATCH] LabelSegmentedKFold cross-validation iterator

This PR implements a variant of KFold that ensures that the train and
test splits are disjoint with respect to a third-party label.
---
 sklearn/cross_validation.py            | 79 ++++++++++++++++++++++++++
 sklearn/tests/test_cross_validation.py | 23 ++++++++
 2 files changed, 102 insertions(+)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index ad7491e6a3eeb..5077ec1ef71d8 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -37,6 +37,7 @@
     'LeaveOneOut',
     'LeavePLabelOut',
     'LeavePOut',
+    'LabelSegmentedKFold',
     'ShuffleSplit',
     'StratifiedKFold',
     'StratifiedShuffleSplit',
@@ -445,6 +446,84 @@ def __len__(self):
         return self.n_folds
 
 
+class LabelSegmentedKFold(_BaseKFold):
+    """Segmented K-Folds cross validation iterator
+
+    Provides train/test indices to split data in train test sets.
+
+    This cross-validation object is a variation of KFold that
+    returns folds where the train and test sets are disjoint with
+    respect to a third-party provided label.
+
+    The difference between LeavePLabelOut and LabelSegmentedKFold is that
+    the former generates splits using all subsets of size `p` unique labels,
+    while the latter splits the data into k consecutive folds, where each
+    third-party label occurs only once in the test set.
+
+    Parameters
+    ----------
+    labels : array-like of int with shape (n_samples,)
+        Arbitrary domain-specific stratification of the data to be used
+        to draw the splits.
+
+    n_folds : int, default=3
+        Number of folds. Must be at least 2.
+
+    shuffle : boolean, default=False
+        Whether to shuffle the order in which the label batches are
+        assigned to the folds.
+
+    random_state : None, int or RandomState, default=None
+        Pseudo-random number generator state used for random
+        sampling. If None, use default numpy RNG for shuffling.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn import cross_validation
+    >>> labels = np.array([1, 2, 3, 1, 2, 3, 4, 4, 4])
+    >>> lkf = cross_validation.LabelSegmentedKFold(labels, n_folds=3)
+    >>> for train_index, test_index in lkf:
+    ...     print("TRAIN:", labels[train_index], "TEST:", labels[test_index])
+    TRAIN: [2 3 2 3] TEST: [1 1 4 4 4]
+    TRAIN: [1 3 1 3 4 4 4] TEST: [2 2]
+    TRAIN: [1 2 1 2 4 4 4] TEST: [3 3]
+    """
+    def __init__(self, labels, n_folds=3, shuffle=False, random_state=None):
+        self.labels = np.array(labels, copy=True)
+        self.unique_labels = np.unique(labels)
+        super(LabelSegmentedKFold, self).__init__(
+            self.unique_labels.size, n_folds, shuffle, random_state)
+
+        self.n = self.labels.size
+        self.shuffle = shuffle
+        self.n_folds = n_folds
+
+        # Batch i receives every n_folds-th unique label, starting at i.
+        self.batches = [self.unique_labels[i::self.n_folds]
+                        for i in range(self.n_folds)]
+        if shuffle:
+            rng = check_random_state(self.random_state)
+            rng.shuffle(self.batches)
+
+    def _iter_test_masks(self):
+        for batch in self.batches:
+            yield np.array([lbl in batch for lbl in self.labels])
+
+    def __repr__(self):
+        return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % (
+            self.__class__.__module__,
+            self.__class__.__name__,
+            self.labels,
+            self.n_folds,
+            self.shuffle,
+            self.random_state,
+        )
+
+    def __len__(self):
+        return self.n_folds
+
+
 class LeaveOneLabelOut(_PartitionIterator):
     """Leave-One-Label_Out cross-validation iterator
 
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 17ecde996d26d..97ab8a17de69a 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -356,6 +356,29 @@ def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
     assert_greater(mean_score, 0.85)
 
 
+def test_label_segmented_kfold():
+    # Every unique label must land in exactly one test fold.
+    labels = np.array([1, 2, 3, 1, 2, 3, 4, 4, 4])
+    expected_train = [
+        np.array([1, 2, 4, 5]),           # labels: 2, 3
+        np.array([0, 2, 3, 5, 6, 7, 8]),  # labels: 1, 3, 4
+        np.array([0, 1, 3, 4, 6, 7, 8])   # labels: 1, 2, 4
+    ]
+    expected_test = [
+        np.array([0, 3, 6, 7, 8]),        # labels: 1, 4
+        np.array([1, 4]),                 # labels: 2
+        np.array([2, 5])                  # labels: 3
+    ]
+    lkf = cval.LabelSegmentedKFold(labels, n_folds=3, shuffle=False)
+    assert_equal(len(lkf), 3)
+    for i, (train, test) in enumerate(lkf):
+        assert_array_equal(expected_train[i], train)
+        assert_array_equal(expected_test[i], test)
+
+    # Smoke test: __repr__ must not raise.
+    repr(lkf)
+
+
 def test_shuffle_split():
     ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
     ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)