Skip to content

LabelSegmentedKFold cross-validation iterator #4709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 79 additions & 1 deletion sklearn/cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from __future__ import division

import warnings
from itertools import chain, combinations
from itertools import chain, combinations, izip_longest
from math import ceil, floor, factorial
import numbers
import time
Expand All @@ -37,6 +37,7 @@
'LeaveOneOut',
'LeavePLabelOut',
'LeavePOut',
'LabelSegmentedKFold'
'ShuffleSplit',
'StratifiedKFold',
'StratifiedShuffleSplit',
Expand Down Expand Up @@ -445,6 +446,83 @@ def __len__(self):
return self.n_folds


class LabelSegmentedKFold(_BaseKFold):
    """Segmented K-Folds cross validation iterator

    Provides train/test indices to split data in train test sets.

    This cross-validation object is a variation of KFold that
    returns folds where train and test set are distinct according to a
    third-party provided label: all samples sharing a label land in the
    same fold, and each label occurs in exactly one test set.

    The difference between LeavePLabelOut and LabelSegmentedKFold is that
    the former generates splits using all subsets of size `p` unique labels,
    while the latter splits the data into k folds, where each
    third-party label occurs only once in the test set.

    Parameters
    ----------
    labels : array-like of int with shape (n_samples,)
        Arbitrary domain-specific stratification of the data to be used to
        draw the splits.

    n_folds : int, default=3
        Number of folds. Must be at least 2.

    shuffle : boolean, default=False
        Whether to shuffle the unique labels before assigning them to
        folds.

    random_state : None, int or RandomState, default=None
        Pseudo-random number generator state used for random
        sampling. If None, use default numpy RNG for shuffling.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import cross_validation
    >>> labels = np.array([1, 2, 3, 1, 2, 3, 4, 4, 4])
    >>> lkf = cross_validation.LabelSegmentedKFold(labels, n_folds=3)
    >>> for train_index, test_index in lkf:
    ...     print("TRAIN:", labels[train_index], "TEST:", labels[test_index])
    TRAIN: [2 3 2 3] TEST: [1 1 4 4 4]
    TRAIN: [1 3 1 3 4 4 4] TEST: [2 2]
    TRAIN: [1 2 1 2 4 4 4] TEST: [3 3]
    """
    def __init__(self, labels, n_folds=3, shuffle=False, random_state=None):
        self.labels = np.array(labels, copy=True)
        self.unique_labels = np.unique(labels)
        super(LabelSegmentedKFold, self).__init__(
            self.unique_labels.size, n_folds, shuffle, random_state)

        # _PartitionIterator iterates over ``self.n`` samples, so restore
        # it to the sample count (the base constructor above received the
        # number of unique labels only for its n_folds validation).
        self.n = self.labels.size
        self.shuffle = shuffle
        self.n_folds = n_folds

        unique = self.unique_labels
        if shuffle:
            # BUG FIX: honor random_state.  The previous code built ``rng``
            # with check_random_state but then called np.random.permutation,
            # ignoring the seeded generator entirely.
            rng = check_random_state(self.random_state)
            unique = rng.permutation(unique)
        # Assign unique labels to folds round-robin: label at position j
        # goes to fold j % n_folds (same assignment as the original
        # grouper/transpose recipe, minus its None padding).  Stored as a
        # list of sets: a list so the CV object is re-iterable (Python 3's
        # ``zip`` is a one-shot iterator), sets for O(1) membership tests.
        # This also drops the Python-2-only ``itertools.izip_longest``.
        self.batches = [set(unique[i::n_folds]) for i in range(n_folds)]

    def _iter_test_indices(self):
        # Yield integer indices — as the method name promises — of every
        # sample whose label belongs to the current fold's label batch.
        # (The previous version yielded a boolean mask instead.)
        for batch in self.batches:
            yield np.array([i for i, label in enumerate(self.labels)
                            if label in batch])

    def __repr__(self):
        return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % (
            self.__class__.__module__,
            self.__class__.__name__,
            self.labels,
            self.n_folds,
            self.shuffle,
            self.random_state,
        )

    def __len__(self):
        # Number of train/test splits this iterator yields.
        return self.n_folds


class LeaveOneLabelOut(_PartitionIterator):
"""Leave-One-Label_Out cross-validation iterator

Expand Down
23 changes: 23 additions & 0 deletions sklearn/tests/test_cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,29 @@ def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372
assert_greater(mean_score, 0.85)


def test_label_segmented_kfold():
    """Check LabelSegmentedKFold yields the expected deterministic splits."""
    labels = np.array([1, 2, 3, 1, 2, 3, 4, 4, 4])
    # Labels are assigned to folds round-robin over the sorted unique
    # labels [1, 2, 3, 4]: fold 0 tests {1, 4}, fold 1 tests {2},
    # fold 2 tests {3}.
    expected_train = [
        np.array([1, 2, 4, 5]),           # labels: 2, 3
        np.array([0, 2, 3, 5, 6, 7, 8]),  # labels: 1, 3, 4
        np.array([0, 1, 3, 4, 6, 7, 8])   # labels: 1, 2, 4
    ]
    expected_test = [
        np.array([0, 3, 6, 7, 8]),        # labels: 1, 4
        np.array([1, 4]),                 # labels: 2
        np.array([2, 5])                  # labels: 3
    ]
    lkf = cval.LabelSegmentedKFold(labels, n_folds=3, shuffle=False)
    assert len(lkf) == 3
    # zip over the splits and the expectations instead of keeping a
    # manual counter; also verifies exactly three splits are produced.
    n_splits = 0
    for (train, test), exp_train, exp_test in zip(lkf, expected_train,
                                                  expected_test):
        assert_array_equal(exp_train, train)
        assert_array_equal(exp_test, test)
        n_splits += 1
    assert n_splits == 3

    # Smoke-test the repr via the builtin rather than calling the
    # dunder method directly.
    repr(lkf)


def test_shuffle_split():
ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
Expand Down