-
-
Notifications
You must be signed in to change notification settings - Fork 26k
[MRG + 1] Added DisjointLabelKFold to perform K-Fold cv on sets with disjoint labels. #4444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
53063ba
37ecdd7
79cea0c
de5d272
a809881
c14f847
b0fc204
cd77b82
1920a75
1916b26
f320bc1
f58dbe6
89358a3
90cfc56
5630d34
19da58a
4706289
b305539
b7fc3d8
65d96b3
fa03f1b
3faa60a
90275be
0b2dff6
6b4b63f
c9a636c
ecb0ea0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -297,6 +297,8 @@ class KFold(_BaseKFold): | |
StratifiedKFold: take label information into account to avoid building | ||
folds with imbalanced class distributions (for binary or multiclass | ||
classification tasks). | ||
|
||
DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. | ||
""" | ||
|
||
def __init__(self, n, n_folds=3, shuffle=False, | ||
|
@@ -332,6 +334,133 @@ def __len__(self): | |
return self.n_folds | ||
|
||
|
||
def disjoint_label_folds(labels, n_folds=3):
    """Distribute samples into folds so that no label spans two folds.

    Parameters
    ----------
    labels : array-like, shape (n_samples,)
        An identifier for each sample. Samples sharing the same
        identifier are guaranteed to land in the same fold.

    n_folds : int, default=3
        Number of folds to split the data into.

    Returns
    -------
    folds : numpy array, shape (n_samples,)
        Integers between 0 and (n_folds - 1); folds[i] is the fold
        index assigned to sample i.

    Notes
    -----
    Labels are placed greedily, most frequent first, each going into
    the currently lightest fold. The number of distinct labels has to
    be at least equal to the number of folds.
    """
    # Encode the labels as integers 0..n_labels-1.
    unique_labels, encoded = np.unique(np.array(labels), return_inverse=True)
    n_labels = len(unique_labels)
    if n_folds > n_labels:
        raise ValueError(
            ("Cannot have number of folds n_folds={0} greater"
             " than the number of labels: {1}.").format(n_folds, n_labels))

    # Weight of each label: how many samples carry it.
    label_weights = np.bincount(encoded)

    # Visit the labels from heaviest to lightest; the stable mergesort
    # keeps tie-breaking deterministic.
    order = np.argsort(label_weights, kind='mergesort')[::-1]

    # Running number of samples already placed in each fold.
    fold_loads = np.zeros(n_folds, dtype=np.uint64)

    # Fold chosen for each (encoded) label.
    label_to_fold = np.zeros(n_labels, dtype=np.uintp)

    # Greedy bin packing: drop the heaviest remaining label into the
    # lightest fold (argmin picks the first fold on ties).
    for label in order:
        lightest = np.argmin(fold_loads)
        fold_loads[lightest] += label_weights[label]
        label_to_fold[label] = lightest

    # Map the per-label assignment back to the samples.
    return label_to_fold[encoded]
|
||
|
||
class DisjointLabelKFold(_BaseKFold):
    """K-fold iterator variant with non-overlapping labels.

    The same label will not appear in two different folds (the number
    of distinct labels has to be at least equal to the number of
    folds).

    The folds are approximately balanced in the sense that the number
    of distinct labels is approximately the same in each fold.

    Parameters
    ----------
    labels : array-like, shape (n_samples,)
        Contains a label for each sample. The folds are built so that
        the same label doesn't appear in two different folds.

    n_folds : int, default=3
        Number of folds.

    Examples
    --------
    >>> from sklearn import cross_validation
    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    >>> y = np.array([1, 2, 3, 4])
    >>> labels = np.array([0, 0, 2, 2])
    >>> dl_kfold = cross_validation.DisjointLabelKFold(labels, n_folds=2)
    >>> len(dl_kfold)
    2
    >>> print(dl_kfold)
    sklearn.cross_validation.DisjointLabelKFold(n_labels=4, n_folds=2)
    >>> for train_index, test_index in dl_kfold:
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    ...     print(X_train, X_test, y_train, y_test)
    ...
    TRAIN: [0 1] TEST: [2 3]
    [[1 2]
     [3 4]] [[5 6]
     [7 8]] [1 2] [3 4]
    TRAIN: [2 3] TEST: [0 1]
    [[5 6]
     [7 8]] [[1 2]
     [3 4]] [3 4] [1 2]

    See also
    --------
    LeaveOneLabelOut for splitting the data according to explicit,
    domain-specific stratification of the dataset.
    """
    def __init__(self, labels, n_folds=3):
        # Shuffling is not implemented yet: shuffle=False,
        # random_state=None.
        super(DisjointLabelKFold, self).__init__(len(labels), n_folds,
                                                 False, None)
        self.n_folds = n_folds
        self.n = len(labels)
        # Precompute the fold index of every sample.
        self.idxs = disjoint_label_folds(labels=labels, n_folds=n_folds)

    def _iter_test_indices(self):
        # Yield, for each fold in turn, the selector of its samples.
        for fold in range(self.n_folds):
            yield (self.idxs == fold)

    def __repr__(self):
        return '{0}.{1}(n_labels={2}, n_folds={3})'.format(
            self.__class__.__module__,
            self.__class__.__name__,
            self.n,
            self.n_folds,
        )

    def __len__(self):
        return self.n_folds
|
||
|
||
class StratifiedKFold(_BaseKFold): | ||
"""Stratified K-Folds cross validation iterator | ||
|
||
|
@@ -380,6 +509,9 @@ class StratifiedKFold(_BaseKFold): | |
All the folds have size trunc(n_samples / n_folds), the last one has the | ||
complementary. | ||
|
||
See also | ||
-------- | ||
DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. | ||
""" | ||
|
||
def __init__(self, y, n_folds=3, shuffle=False, | ||
|
@@ -486,6 +618,9 @@ class LeaveOneLabelOut(_PartitionIterator): | |
[3 4]] [[5 6] | ||
[7 8]] [1 2] [1 2] | ||
|
||
See also | ||
-------- | ||
DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. | ||
""" | ||
|
||
def __init__(self, labels): | ||
|
@@ -559,6 +694,10 @@ class LeavePLabelOut(_PartitionIterator): | |
TRAIN: [0] TEST: [1 2] | ||
[[1 2]] [[3 4] | ||
[5 6]] [1] [2 1] | ||
|
||
See also | ||
-------- | ||
DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. | ||
""" | ||
|
||
def __init__(self, labels, p): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
from sklearn.utils.testing import assert_almost_equal | ||
from sklearn.utils.testing import assert_raises | ||
from sklearn.utils.testing import assert_greater | ||
from sklearn.utils.testing import assert_greater_equal | ||
from sklearn.utils.testing import assert_less | ||
from sklearn.utils.testing import assert_not_equal | ||
from sklearn.utils.testing import assert_array_almost_equal | ||
|
@@ -1038,3 +1039,70 @@ def test_check_is_partition(): | |
|
||
p[0] = 23 | ||
assert_false(cval._check_is_partition(p, 100)) | ||
|
||
|
||
def test_disjoint_label_folds():
    # The folds must be equilibrated and no label may appear in two
    # different folds.

    def check_fold_assignment(labels, n_folds):
        # Shared checks: balanced fold sizes, one fold per label, and
        # label-disjoint train/test splits.
        n_samples = len(labels)
        tolerance = 0.05 * n_samples  # 5 percent error allowed
        folds = cval.disjoint_label_folds(labels, n_folds)
        ideal_n_labels_per_fold = n_samples // n_folds

        # Folds should have approximately the same size.
        assert_equal(len(folds), len(labels))
        for fold in np.unique(folds):
            assert_greater_equal(
                tolerance, abs(sum(folds == fold) - ideal_n_labels_per_fold))

        # Each label must appear in exactly one fold.
        for label in np.unique(labels):
            assert_equal(len(np.unique(folds[labels == label])), 1)

        # No label may sit on both sides of any split; the object
        # dtype allows fancy indexing on string labels as well.
        labels = np.asarray(labels, dtype=object)
        for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds):
            assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Integer labels drawn uniformly at random (fixed seed for
    # reproducibility).
    rng = np.random.RandomState(0)
    check_fold_assignment(rng.randint(0, 15, 1000), n_folds=5)

    # String labels with uneven frequencies.
    names = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
             'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
             'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
             'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
             'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
             'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
             'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
    check_fold_assignment(np.asarray(names), n_folds=5)

    # Should fail if there are more folds than labels.
    labels = np.array([1, 1, 1, 2, 2])
    assert_raises(ValueError, cval.disjoint_label_folds, labels, n_folds=3)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
id -> index
And maybe document the dtype; it should be
np.intp
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@larsmans @ogrisel changed to mergesort, thanks.
@larsmans currently the implementation allows the user to have an array of strings for y (for instance the name or id of the subjects in an experiment) as I didn't see any reason to not allow that. Do you want to enforce a np.intp dtype?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think string should be welcome.