From 53063ba667750544f9e57d14043dd06a748d1110 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Tue, 24 Mar 2015 15:32:29 +0000 Subject: [PATCH 01/25] Added subject independent KFold --- sklearn/cross_validation.py | 65 ++++++++++++++++++++++++++ sklearn/tests/test_cross_validation.py | 56 ++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index dacf4c4c67f63..7c37148dbba30 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -332,6 +332,71 @@ def __len__(self): return self.n_folds +def subject_independent_folds(subjects, n_folds=3): + """ Creates folds where a same subject is not in two different folds + + Parameters + ---------- + subjects: iterable of shape (n_samples, ) + contains an id for each sample + The folds are built so that the same id doesn't appear in two different folds + + n_folds: int, default is 3 + number of folds to split the data into + + Returns + ------- + folds: numpy array of shape (n_samples, ) + array of integers between 0 and (n_folds - 1) + folds[i] contains the folds to which sample i is assigned. + + Notes + ----- + The folds are built by distributing the subjects by frequency of appearance. 
+ """ + subjects = np.array(subjects) + unique_subjects = np.unique(subjects) + + # number of occurrence of each subject (its "weight") + weight_per_subject = sorted([(sum(subjects == i), i) for i in unique_subjects]) + # Total weight of each fold + weight_per_fold = np.zeros(n_folds) + # For each sample, a digit between 0 and (n_folds - 1) to tell which fold it belongs to + folds = np.zeros(len(subjects)) + + # While there are weights, distribute them + # Specifically, add the biggest weight to the lightest fold + while weight_per_subject: + ind_min = np.argmin(weight_per_fold) + w, actor = weight_per_subject.pop() + weight_per_fold[ind_min] += w + folds[subjects == actor] = ind_min + + return folds + + +class SubjectIndependentKfold(_BaseKFold): + def __init__(self, subjects, n_folds=3): + self.n_folds = n_folds + self.n = len(subjects) + self.idxs = subject_independent_folds(subjects=subjects, n_folds=n_folds) + + def _iter_test_indices(self): + for i in range(self.n_folds): + yield (self.idxs == i) + + def __repr__(self): + return '{}.{}(n_subjects={}, n_folds={})'.format( + self.__class__.__module__, + self.__class__.__name__, + self.n, + self.n_folds, + ) + + def __len__(self): + return self.n_folds + + class StratifiedKFold(_BaseKFold): """Stratified K-Folds cross validation iterator diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 786bf561ec5e2..a1bf5a4293fd0 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -1038,3 +1038,59 @@ def test_check_is_partition(): p[0] = 23 assert_false(cval._check_is_partition(p, 100)) + + +def test_subject_independent_folds(): + """ Check that the function produces equilibrated folds + with no subject appearing in two different folds + """ + # Fix the seed for reproducibility + np.random.seed(0) + + # Parameters of the test + n_subjects = 15 + n_samples = 1000 + n_folds = 5 + + # Construct the test data + tolerance = 0.05 * 
n_samples # 5 percent error allowed + subjects = np.random.randint(0, n_subjects, n_samples) + folds = cval.subject_independent_folds(subjects, n_folds) + ideal_n_subjects_per_fold = n_samples // n_folds + + # Check that folds have approximately the same size + assert(len(folds)==len(subjects)) + for i in np.unique(folds): + assert(abs(sum(folds == i) - ideal_n_subjects_per_fold) <= tolerance) + + # Check that each subjects appears only in 1 fold + for subject in np.unique(subjects): + assert(len(np.unique(folds[subjects == subject])) == 1) + + subjects = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', + 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', + 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', + 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', + 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', + 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', + 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'] + + n_subjects = len(np.unique(subjects)) + n_samples = len(subjects) + n_folds = 5 + + # Construct the test data + tolerance = 0.05 * n_samples # 5 percent error allowed + subjects = np.random.randint(0, n_subjects, n_samples) + folds = cval.subject_independent_folds(subjects, n_folds) + ideal_n_subjects_per_fold = n_samples // n_folds + + # Check that folds have approximately the same size + assert(len(folds)==len(subjects)) + for i in np.unique(folds): + assert(abs(sum(folds == i) - ideal_n_subjects_per_fold) <= tolerance) + + # Check that each subjects appears only in 1 fold + for subject in np.unique(subjects): + assert(len(np.unique(folds[subjects == subject])) == 1) + From 37ecdd7776f9293c19f233e23e0f2ddffab1d9e6 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Wed, 25 Mar 2015 11:34:33 +0000 Subject: [PATCH 02/25] Changed SubjectIndependentKFold to DisjointGroupKFold cosmetic changes test (fix seed correctly, use assert_equal for meaningful error messages) --- sklearn/cross_validation.py | 44 ++++++++++++++++---------- 
sklearn/tests/test_cross_validation.py | 43 ++++++++++++------------- 2 files changed, 49 insertions(+), 38 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 7c37148dbba30..47bff1fe8bf0b 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -332,12 +332,12 @@ def __len__(self): return self.n_folds -def subject_independent_folds(subjects, n_folds=3): - """ Creates folds where a same subject is not in two different folds +def disjoint_group_folds(groups, n_folds=3): + """Creates folds where a same group is not in two different folds Parameters ---------- - subjects: iterable of shape (n_samples, ) + groups: iterable of shape (n_samples, ) contains an id for each sample The folds are built so that the same id doesn't appear in two different folds @@ -352,34 +352,46 @@ def subject_independent_folds(subjects, n_folds=3): Notes ----- - The folds are built by distributing the subjects by frequency of appearance. + The folds are built by distributing the groups by frequency of appearance. 
""" - subjects = np.array(subjects) - unique_subjects = np.unique(subjects) + groups = np.array(groups) + unique_groups = np.unique(groups) - # number of occurrence of each subject (its "weight") - weight_per_subject = sorted([(sum(subjects == i), i) for i in unique_subjects]) + # number of occurrence of each group (its "weight") + weight_per_group = sorted([(sum(groups == group_id), group_id) for group_id in unique_groups]) # Total weight of each fold weight_per_fold = np.zeros(n_folds) # For each sample, a digit between 0 and (n_folds - 1) to tell which fold it belongs to - folds = np.zeros(len(subjects)) + folds = np.zeros(len(groups)) # While there are weights, distribute them # Specifically, add the biggest weight to the lightest fold - while weight_per_subject: + while weight_per_group: ind_min = np.argmin(weight_per_fold) - w, actor = weight_per_subject.pop() + w, group_id = weight_per_group.pop() weight_per_fold[ind_min] += w - folds[subjects == actor] = ind_min + folds[groups == group_id] = ind_min return folds -class SubjectIndependentKfold(_BaseKFold): - def __init__(self, subjects, n_folds=3): +class DisjointGroupKfold(_BaseKFold): + def __init__(self, groups, n_folds=3): + """Creates K approximately equilibrated folds + where the same group will not appear in two different folds + + Parameters + ---------- + groups: numpy array of shape (n_samples, ) + contains an id for each sample + The folds are built so that the same id doesn't appear in two different folds + + n_folds: int, default is 3 + number of folds + """ self.n_folds = n_folds - self.n = len(subjects) - self.idxs = subject_independent_folds(subjects=subjects, n_folds=n_folds) + self.n = len(groups) + self.idxs = disjoint_group_folds(groups=groups, n_folds=n_folds) def _iter_test_indices(self): for i in range(self.n_folds): diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index a1bf5a4293fd0..8bfb713e4b0e9 100644 --- 
a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -12,6 +12,7 @@ from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_greater +from sklearn.utils.testing import assert_greater_equal from sklearn.utils.testing import assert_less from sklearn.utils.testing import assert_not_equal from sklearn.utils.testing import assert_array_almost_equal @@ -1042,32 +1043,33 @@ def test_check_is_partition(): def test_subject_independent_folds(): """ Check that the function produces equilibrated folds - with no subject appearing in two different folds + with no group appearing in two different folds """ # Fix the seed for reproducibility - np.random.seed(0) + rng = np.random.RandomState(0) # Parameters of the test - n_subjects = 15 + n_groups = 15 n_samples = 1000 n_folds = 5 # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed - subjects = np.random.randint(0, n_subjects, n_samples) - folds = cval.subject_independent_folds(subjects, n_folds) - ideal_n_subjects_per_fold = n_samples // n_folds + groups = np.random.randint(0, n_groups, n_samples) + folds = cval.disjoint_group_folds(groups, n_folds) + ideal_n_groups_per_fold = n_samples // n_folds # Check that folds have approximately the same size - assert(len(folds)==len(subjects)) + assert_equal(len(folds), len(groups)) for i in np.unique(folds): - assert(abs(sum(folds == i) - ideal_n_subjects_per_fold) <= tolerance) + assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each subjects appears only in 1 fold - for subject in np.unique(subjects): - assert(len(np.unique(folds[subjects == subject])) == 1) + for group in np.unique(groups): + assert_equal(len(np.unique(folds[groups == group])), 1) - subjects = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', + # Construct the test data + groups = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 
'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', @@ -1075,22 +1077,19 @@ def test_subject_independent_folds(): 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'] - n_subjects = len(np.unique(subjects)) - n_samples = len(subjects) + n_groups = len(np.unique(groups)) + n_samples = len(groups) n_folds = 5 - - # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed - subjects = np.random.randint(0, n_subjects, n_samples) - folds = cval.subject_independent_folds(subjects, n_folds) - ideal_n_subjects_per_fold = n_samples // n_folds + folds = cval.disjoint_group_folds(groups, n_folds) + ideal_n_groups_per_fold = n_samples // n_folds # Check that folds have approximately the same size - assert(len(folds)==len(subjects)) + assert_equal(len(folds), len(groups)) for i in np.unique(folds): - assert(abs(sum(folds == i) - ideal_n_subjects_per_fold) <= tolerance) + assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each subjects appears only in 1 fold - for subject in np.unique(subjects): - assert(len(np.unique(folds[subjects == subject])) == 1) + for group in np.unique(groups): + assert_equal(len(np.unique(folds[groups == group])), 1) From 79cea0c32196a7a59eda8a6241904b7458f5ffe4 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 26 Mar 2015 11:15:11 +0000 Subject: [PATCH 03/25] Changed name to DisjointLabelKFold --- sklearn/cross_validation.py | 38 +++++++++++++------------- sklearn/tests/test_cross_validation.py | 38 +++++++++++++------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 47bff1fe8bf0b..9d6ceb9e4dda6 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -332,12 +332,12 @@ def __len__(self): return self.n_folds -def 
disjoint_group_folds(groups, n_folds=3): - """Creates folds where a same group is not in two different folds +def disjoint_label_folds(y, n_folds=3): + """Creates folds where a same label is not in two different folds Parameters ---------- - groups: iterable of shape (n_samples, ) + y: iterable of shape (n_samples, ) contains an id for each sample The folds are built so that the same id doesn't appear in two different folds @@ -352,46 +352,46 @@ def disjoint_group_folds(groups, n_folds=3): Notes ----- - The folds are built by distributing the groups by frequency of appearance. + The folds are built by distributing the labels by frequency of appearance. """ - groups = np.array(groups) - unique_groups = np.unique(groups) + labels = np.array(y) + unique_labels = np.unique(labels) - # number of occurrence of each group (its "weight") - weight_per_group = sorted([(sum(groups == group_id), group_id) for group_id in unique_groups]) + # number of occurrence of each label (its "weight") + weight_per_label = sorted([(sum(labels == label), label) for label in unique_labels]) # Total weight of each fold weight_per_fold = np.zeros(n_folds) # For each sample, a digit between 0 and (n_folds - 1) to tell which fold it belongs to - folds = np.zeros(len(groups)) + folds = np.zeros(len(labels)) # While there are weights, distribute them # Specifically, add the biggest weight to the lightest fold - while weight_per_group: + while weight_per_label: ind_min = np.argmin(weight_per_fold) - w, group_id = weight_per_group.pop() + w, label = weight_per_label.pop() weight_per_fold[ind_min] += w - folds[groups == group_id] = ind_min + folds[labels == label] = ind_min return folds -class DisjointGroupKfold(_BaseKFold): - def __init__(self, groups, n_folds=3): +class DisjointLabelKfold(_BaseKFold): + def __init__(self, y, n_folds=3): """Creates K approximately equilibrated folds - where the same group will not appear in two different folds + where the same label will not appear in two different 
folds Parameters ---------- - groups: numpy array of shape (n_samples, ) + y: numpy array of shape (n_samples, ) contains an id for each sample - The folds are built so that the same id doesn't appear in two different folds + The folds are built so that the same label doesn't appear in two different folds n_folds: int, default is 3 number of folds """ self.n_folds = n_folds - self.n = len(groups) - self.idxs = disjoint_group_folds(groups=groups, n_folds=n_folds) + self.n = len(y) + self.idxs = disjoint_label_folds(y=y, n_folds=n_folds) def _iter_test_indices(self): for i in range(self.n_folds): diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 8bfb713e4b0e9..6db640c1925e4 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -1041,35 +1041,35 @@ def test_check_is_partition(): assert_false(cval._check_is_partition(p, 100)) -def test_subject_independent_folds(): +def test_disjoint_label_folds(): """ Check that the function produces equilibrated folds - with no group appearing in two different folds + with no label appearing in two different folds """ # Fix the seed for reproducibility rng = np.random.RandomState(0) # Parameters of the test - n_groups = 15 + n_labels = 15 n_samples = 1000 n_folds = 5 # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed - groups = np.random.randint(0, n_groups, n_samples) - folds = cval.disjoint_group_folds(groups, n_folds) - ideal_n_groups_per_fold = n_samples // n_folds + labels = np.random.randint(0, n_labels, n_samples) + folds = cval.disjoint_label_folds(labels, n_folds) + ideal_n_labels_per_fold = n_samples // n_folds # Check that folds have approximately the same size - assert_equal(len(folds), len(groups)) + assert_equal(len(folds), len(labels)) for i in np.unique(folds): - assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) + assert_greater_equal(tolerance, abs(sum(folds == i) - 
ideal_n_labels_per_fold)) # Check that each subjects appears only in 1 fold - for group in np.unique(groups): - assert_equal(len(np.unique(folds[groups == group])), 1) + for label in np.unique(labels): + assert_equal(len(np.unique(folds[labels == label])), 1) # Construct the test data - groups = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', + labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', @@ -1077,19 +1077,19 @@ def test_subject_independent_folds(): 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'] - n_groups = len(np.unique(groups)) - n_samples = len(groups) + n_labels = len(np.unique(labels)) + n_samples = len(labels) n_folds = 5 tolerance = 0.05 * n_samples # 5 percent error allowed - folds = cval.disjoint_group_folds(groups, n_folds) - ideal_n_groups_per_fold = n_samples // n_folds + folds = cval.disjoint_label_folds(labels, n_folds) + ideal_n_labels_per_fold = n_samples // n_folds # Check that folds have approximately the same size - assert_equal(len(folds), len(groups)) + assert_equal(len(folds), len(labels)) for i in np.unique(folds): - assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) + assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each subjects appears only in 1 fold - for group in np.unique(groups): - assert_equal(len(np.unique(folds[groups == group])), 1) + for label in np.unique(labels): + assert_equal(len(np.unique(folds[labels == label])), 1) From de5d272eaf71b96ca98f7c4b84a8488db2b8b268 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 26 Mar 2015 12:31:41 +0000 Subject: [PATCH 04/25] Added example of use --- sklearn/cross_validation.py | 40 ++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git 
a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 9d6ceb9e4dda6..d2e26c5bbbf17 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -382,12 +382,38 @@ def __init__(self, y, n_folds=3): Parameters ---------- - y: numpy array of shape (n_samples, ) - contains an id for each sample - The folds are built so that the same label doesn't appear in two different folds - - n_folds: int, default is 3 - number of folds + y : array-like with shape (n_samples, ) + Contains a label for each sample. + The folds are built so that the same label doesn't appear in two different folds. + + n_folds : int, default is 3 + Number of folds. + + Examples + -------- + >>> from sklearn import cross_validation + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 3, 4]) + >>> labels = np.array([0, 0, 2, 2]) + >>> dl_kfold = cross_validation.DisjointLabelKfold(labels, n_folds=2) + >>> len(dl_kfold) + 2 + >>> print(dl_kfold) + sklearn.cross_validation.DisjointLabelKfold(n_labels=4, n_folds=2) + >>> for train_index, test_index in dl_kfold: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) + ... 
+ TRAIN: [0 1] TEST: [2 3] + [[1 2] + [3 4]] [[5 6] + [7 8]] [1 2] [3 4] + TRAIN: [2 3] TEST: [0 1] + [[5 6] + [7 8]] [[1 2] + [3 4]] [3 4] [1 2] """ self.n_folds = n_folds self.n = len(y) @@ -398,7 +424,7 @@ def _iter_test_indices(self): yield (self.idxs == i) def __repr__(self): - return '{}.{}(n_subjects={}, n_folds={})'.format( + return '{}.{}(n_labels={}, n_folds={})'.format( self.__class__.__module__, self.__class__.__name__, self.n, From a8098819f5983efc21767e74fac2509f2ae74726 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 26 Mar 2015 12:56:33 +0000 Subject: [PATCH 05/25] FIX: whitespace related doctest failure --- sklearn/cross_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index d2e26c5bbbf17..db7071dd05276 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -408,12 +408,12 @@ def __init__(self, y, n_folds=3): ... TRAIN: [0 1] TEST: [2 3] [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [3 4] + [3 4]] [[5 6] + [7 8]] [1 2] [3 4] TRAIN: [2 3] TEST: [0 1] [[5 6] - [7 8]] [[1 2] - [3 4]] [3 4] [1 2] + [7 8]] [[1 2] + [3 4]] [3 4] [1 2] """ self.n_folds = n_folds self.n = len(y) From c14f84784031085b0ba912174c3fd41b56263367 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 26 Mar 2015 13:24:01 +0000 Subject: [PATCH 06/25] FIX: Python 2.6 requires the field numbers in print --- sklearn/cross_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index db7071dd05276..53420c97157cf 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -424,7 +424,7 @@ def _iter_test_indices(self): yield (self.idxs == i) def __repr__(self): - return '{}.{}(n_labels={}, n_folds={})'.format( + return '{0}.{1}(n_labels={2}, n_folds={3})'.format( self.__class__.__module__, self.__class__.__name__, self.n, From b0fc20422f26e1134e8b618ae047ba22444acf57 
Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 26 Mar 2015 13:45:28 +0000 Subject: [PATCH 07/25] FIX: change docstring to comment in test function --- sklearn/tests/test_cross_validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 6db640c1925e4..25b617c93d3ae 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -1042,9 +1042,9 @@ def test_check_is_partition(): def test_disjoint_label_folds(): - """ Check that the function produces equilibrated folds - with no label appearing in two different folds - """ + ## Check that the function produces equilibrated folds + ## with no label appearing in two different folds + # Fix the seed for reproducibility rng = np.random.RandomState(0) From 1916b26425772f177b155c012c0d46ade2ddc8b3 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 14 May 2015 12:16:34 +0100 Subject: [PATCH 08/25] DOC: moved docstring from function to class --- sklearn/cross_validation.py | 77 +++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 53420c97157cf..965596ce74475 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -376,45 +376,46 @@ def disjoint_label_folds(y, n_folds=3): class DisjointLabelKfold(_BaseKFold): + """Creates K approximately equilibrated folds + where the same label will not appear in two different folds + + Parameters + ---------- + y : array-like with shape (n_samples, ) + Contains a label for each sample. + The folds are built so that the same label doesn't appear in two different folds. + + n_folds : int, default is 3 + Number of folds. 
+ + Examples + -------- + >>> from sklearn import cross_validation + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 3, 4]) + >>> labels = np.array([0, 0, 2, 2]) + >>> dl_kfold = cross_validation.DisjointLabelKfold(labels, n_folds=2) + >>> len(dl_kfold) + 2 + >>> print(dl_kfold) + sklearn.cross_validation.DisjointLabelKfold(n_labels=4, n_folds=2) + >>> for train_index, test_index in dl_kfold: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) + ... + TRAIN: [0 1] TEST: [2 3] + [[1 2] + [3 4]] [[5 6] + [7 8]] [1 2] [3 4] + TRAIN: [2 3] TEST: [0 1] + [[5 6] + [7 8]] [[1 2] + [3 4]] [3 4] [1 2] + """ + def __init__(self, y, n_folds=3): - """Creates K approximately equilibrated folds - where the same label will not appear in two different folds - - Parameters - ---------- - y : array-like with shape (n_samples, ) - Contains a label for each sample. - The folds are built so that the same label doesn't appear in two different folds. - - n_folds : int, default is 3 - Number of folds. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 3, 4]) - >>> labels = np.array([0, 0, 2, 2]) - >>> dl_kfold = cross_validation.DisjointLabelKfold(labels, n_folds=2) - >>> len(dl_kfold) - 2 - >>> print(dl_kfold) - sklearn.cross_validation.DisjointLabelKfold(n_labels=4, n_folds=2) - >>> for train_index, test_index in dl_kfold: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) - ... 
- TRAIN: [0 1] TEST: [2 3] - [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [3 4] - TRAIN: [2 3] TEST: [0 1] - [[5 6] - [7 8]] [[1 2] - [3 4]] [3 4] [1 2] - """ self.n_folds = n_folds self.n = len(y) self.idxs = disjoint_label_folds(y=y, n_folds=n_folds) From f320bc1a384221e359803a0f84319a1979712344 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 14 May 2015 12:29:42 +0100 Subject: [PATCH 09/25] FIX: added call to parent class --- sklearn/cross_validation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 965596ce74475..a56271ef0b160 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -416,6 +416,8 @@ class DisjointLabelKfold(_BaseKFold): """ def __init__(self, y, n_folds=3): + # No shuffling implemented yet + super(KFold, self).__init__(n, n_folds, False, None) self.n_folds = n_folds self.n = len(y) self.idxs = disjoint_label_folds(y=y, n_folds=n_folds) From f58dbe663b9b4e250719d98bb5394685dfd20630 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 14 May 2015 12:44:28 +0100 Subject: [PATCH 10/25] FIX: error in calling the parent --- sklearn/cross_validation.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index a56271ef0b160..42bcdb97ef860 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -406,18 +406,18 @@ class DisjointLabelKfold(_BaseKFold): ... print(X_train, X_test, y_train, y_test) ... 
TRAIN: [0 1] TEST: [2 3] - [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [3 4] + [[1 2] + [3 4]] [[5 6] + [7 8]] [1 2] [3 4] TRAIN: [2 3] TEST: [0 1] - [[5 6] - [7 8]] [[1 2] - [3 4]] [3 4] [1 2] + [[5 6] + [7 8]] [[1 2] + [3 4]] [3 4] [1 2] """ def __init__(self, y, n_folds=3): # No shuffling implemented yet - super(KFold, self).__init__(n, n_folds, False, None) + super(DisjointLabelKfold, self).__init__(len(y), n_folds, False, None) self.n_folds = n_folds self.n = len(y) self.idxs = disjoint_label_folds(y=y, n_folds=n_folds) From 89358a3670f444e7810e9c7627a90f7ba7d36b6a Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 14 May 2015 13:30:24 +0100 Subject: [PATCH 11/25] DOC: fixed doctest --- sklearn/cross_validation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 42bcdb97ef860..b90c2dcd9618c 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -406,13 +406,13 @@ class DisjointLabelKfold(_BaseKFold): ... print(X_train, X_test, y_train, y_test) ... TRAIN: [0 1] TEST: [2 3] - [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [3 4] + [[1 2] + [3 4]] [[5 6] + [7 8]] [1 2] [3 4] TRAIN: [2 3] TEST: [0 1] - [[5 6] - [7 8]] [[1 2] - [3 4]] [3 4] [1 2] + [[5 6] + [7 8]] [[1 2] + [3 4]] [3 4] [1 2] """ def __init__(self, y, n_folds=3): From 90cfc566feff55de32bcf060ef7d644edf845649 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 14 May 2015 14:33:42 +0100 Subject: [PATCH 12/25] FIX: doctest --- sklearn/cross_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index b90c2dcd9618c..5b5a7e7260e65 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -407,12 +407,12 @@ class DisjointLabelKfold(_BaseKFold): ... 
TRAIN: [0 1] TEST: [2 3] [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [3 4] + [3 4]] [[5 6] + [7 8]] [1 2] [3 4] TRAIN: [2 3] TEST: [0 1] [[5 6] - [7 8]] [[1 2] - [3 4]] [3 4] [1 2] + [7 8]] [[1 2] + [3 4]] [3 4] [1 2] """ def __init__(self, y, n_folds=3): From 5630d34512665d3260f9dc5e3131dfa25f508f27 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Tue, 23 Jun 2015 16:35:47 +0100 Subject: [PATCH 13/25] Cosmetic changes (minor refactoring) --- sklearn/cross_validation.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 5b5a7e7260e65..5c26d5ab2bf66 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -333,29 +333,29 @@ def __len__(self): def disjoint_label_folds(y, n_folds=3): - """Creates folds where a same label is not in two different folds + """Creates folds where a same label is not in two different folds. Parameters ---------- - y: iterable of shape (n_samples, ) - contains an id for each sample - The folds are built so that the same id doesn't appear in two different folds + y : iterable of shape (n_samples,) + Contains an id for each sample. + The folds are built so that the same id doesn't appear in two different folds. n_folds: int, default is 3 - number of folds to split the data into + Number of folds to split the data into. Returns ------- folds: numpy array of shape (n_samples, ) - array of integers between 0 and (n_folds - 1) - folds[i] contains the folds to which sample i is assigned. + Array of integers between 0 and (n_folds - 1). + Folds[i] contains the folds to which sample i is assigned. Notes ----- The folds are built by distributing the labels by frequency of appearance. 
""" labels = np.array(y) - unique_labels = np.unique(labels) + unique_labels, y = np.unique(labels, return_inverse=True) # number of occurrence of each label (its "weight") weight_per_label = sorted([(sum(labels == label), label) for label in unique_labels]) @@ -375,9 +375,10 @@ def disjoint_label_folds(y, n_folds=3): return folds -class DisjointLabelKfold(_BaseKFold): - """Creates K approximately equilibrated folds - where the same label will not appear in two different folds +class DisjointLabelKFold(_BaseKFold): + """Creates K approximately equilibrated folds. + + The same label will not appear in two different folds. Parameters ---------- @@ -394,11 +395,11 @@ class DisjointLabelKfold(_BaseKFold): >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) >>> y = np.array([1, 2, 3, 4]) >>> labels = np.array([0, 0, 2, 2]) - >>> dl_kfold = cross_validation.DisjointLabelKfold(labels, n_folds=2) + >>> dl_kfold = cross_validation.DisjointLabelKFold(labels, n_folds=2) >>> len(dl_kfold) 2 >>> print(dl_kfold) - sklearn.cross_validation.DisjointLabelKfold(n_labels=4, n_folds=2) + sklearn.cross_validation.DisjointLabelKFold(n_labels=4, n_folds=2) >>> for train_index, test_index in dl_kfold: ... print("TRAIN:", train_index, "TEST:", test_index) ... 
X_train, X_test = X[train_index], X[test_index] @@ -415,9 +416,9 @@ class DisjointLabelKfold(_BaseKFold): [3 4]] [3 4] [1 2] """ - def __init__(self, y, n_folds=3): + def __init__(self, y, n_folds=3, shuffle=False, random_state=None): # No shuffling implemented yet - super(DisjointLabelKfold, self).__init__(len(y), n_folds, False, None) + super(DisjointLabelKFold, self).__init__(len(y), n_folds, shuffle, random_state) self.n_folds = n_folds self.n = len(y) self.idxs = disjoint_label_folds(y=y, n_folds=n_folds) From 19da58ae24cd3b49b884303ed090d090dd917bce Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Tue, 23 Jun 2015 17:17:53 +0100 Subject: [PATCH 14/25] Optimised code (use np.bincount) --- sklearn/cross_validation.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 5c26d5ab2bf66..28fa786d9516e 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -358,20 +358,26 @@ def disjoint_label_folds(y, n_folds=3): unique_labels, y = np.unique(labels, return_inverse=True) # number of occurrence of each label (its "weight") - weight_per_label = sorted([(sum(labels == label), label) for label in unique_labels]) + weight_per_label = np.bincount(y) + # We want to distribute the heaviest weights first + ind = np.argsort(weight_per_label)[::-1] + weight_per_label = weight_per_label[ind] + # Total weight of each fold weight_per_fold = np.zeros(n_folds) - # For each sample, a digit between 0 and (n_folds - 1) to tell which fold it belongs to - folds = np.zeros(len(labels)) + + # Mapping from label index to fold index + label_to_fold = np.zeros(len(unique_labels)) # While there are weights, distribute them # Specifically, add the biggest weight to the lightest fold - while weight_per_label: + for label_index, w in enumerate(weight_per_label): ind_min = np.argmin(weight_per_fold) - w, label = weight_per_label.pop() weight_per_fold[ind_min] += w - 
folds[labels == label] = ind_min + label_to_fold[ind[label_index]] = ind_min + folds = label_to_fold[y] + return folds From 4706289c251f6ec0ff5066fb9eee50d1082fe11e Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Tue, 23 Jun 2015 18:52:34 +0100 Subject: [PATCH 15/25] Cosmetic: use samples instead of weight for clarity --- sklearn/cross_validation.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 28fa786d9516e..29ac0a5244382 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -358,22 +358,22 @@ def disjoint_label_folds(y, n_folds=3): unique_labels, y = np.unique(labels, return_inverse=True) # number of occurrence of each label (its "weight") - weight_per_label = np.bincount(y) - # We want to distribute the heaviest weights first - ind = np.argsort(weight_per_label)[::-1] - weight_per_label = weight_per_label[ind] + samples_per_label = np.bincount(y) + # We want to distribute the most frequent labels first + ind = np.argsort(samples_per_label)[::-1] + samples_per_label = samples_per_label[ind] # Total weight of each fold - weight_per_fold = np.zeros(n_folds) + samples_per_fold = np.zeros(n_folds) # Mapping from label index to fold index label_to_fold = np.zeros(len(unique_labels)) # While there are weights, distribute them # Specifically, add the biggest weight to the lightest fold - for label_index, w in enumerate(weight_per_label): - ind_min = np.argmin(weight_per_fold) - weight_per_fold[ind_min] += w + for label_index, w in enumerate(samples_per_label): + ind_min = np.argmin(samples_per_fold) + samples_per_fold[ind_min] += w label_to_fold[ind[label_index]] = ind_min folds = label_to_fold[y] From b3055396769e9ca5b440135c3cc03ce8861a50ba Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Tue, 23 Jun 2015 20:41:07 +0100 Subject: [PATCH 16/25] Minor fix: removed shuffle parameter --- sklearn/cross_validation.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 29ac0a5244382..d45cf3984c0a6 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -422,9 +422,9 @@ class DisjointLabelKFold(_BaseKFold): [3 4]] [3 4] [1 2] """ - def __init__(self, y, n_folds=3, shuffle=False, random_state=None): + def __init__(self, y, n_folds=3): # No shuffling implemented yet - super(DisjointLabelKFold, self).__init__(len(y), n_folds, shuffle, random_state) + super(DisjointLabelKFold, self).__init__(len(y), n_folds, False, None) self.n_folds = n_folds self.n = len(y) self.idxs = disjoint_label_folds(y=y, n_folds=n_folds) From b7fc3d884afff3dceb834dc7117c5883f9ba391a Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Tue, 23 Jun 2015 20:55:58 +0100 Subject: [PATCH 17/25] Cosmetic --- sklearn/cross_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index d45cf3984c0a6..a85ad0fa527cb 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -337,11 +337,11 @@ def disjoint_label_folds(y, n_folds=3): Parameters ---------- - y : iterable of shape (n_samples,) + y: numpy array, shape (n_samples,) Contains an id for each sample. The folds are built so that the same id doesn't appear in two different folds. - n_folds: int, default is 3 + n_folds: int, default=3 Number of folds to split the data into. Returns From 65d96b3fd6d00074a142736c1c71f0af5fbaf9b8 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 25 Jun 2015 10:47:00 +0100 Subject: [PATCH 18/25] Use mergesort instead of quicksort for reproducibility. 
--- sklearn/cross_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index a85ad0fa527cb..ec7ed22655b41 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -360,7 +360,7 @@ def disjoint_label_folds(y, n_folds=3): # number of occurrence of each label (its "weight") samples_per_label = np.bincount(y) # We want to distribute the most frequent labels first - ind = np.argsort(samples_per_label)[::-1] + ind = np.argsort(samples_per_label, kind='mergesort')[::-1] samples_per_label = samples_per_label[ind] # Total weight of each fold From fa03f1b1b2dcb1ac92a32bea5621a0863500351b Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Thu, 25 Jun 2015 12:01:32 +0100 Subject: [PATCH 19/25] Changed variable name 'y' to 'label'. --- sklearn/cross_validation.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index ec7ed22655b41..c3639f4e99969 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -332,12 +332,12 @@ def __len__(self): return self.n_folds -def disjoint_label_folds(y, n_folds=3): +def disjoint_label_folds(labels, n_folds=3): """Creates folds where a same label is not in two different folds. Parameters ---------- - y: numpy array, shape (n_samples,) + labels: numpy array, shape (n_samples,) Contains an id for each sample. The folds are built so that the same id doesn't appear in two different folds. @@ -354,11 +354,11 @@ def disjoint_label_folds(y, n_folds=3): ----- The folds are built by distributing the labels by frequency of appearance. 
""" - labels = np.array(y) - unique_labels, y = np.unique(labels, return_inverse=True) + labels = np.array(labels) + unique_labels, labels = np.unique(labels, return_inverse=True) # number of occurrence of each label (its "weight") - samples_per_label = np.bincount(y) + samples_per_label = np.bincount(labels) # We want to distribute the most frequent labels first ind = np.argsort(samples_per_label, kind='mergesort')[::-1] samples_per_label = samples_per_label[ind] @@ -376,7 +376,7 @@ def disjoint_label_folds(y, n_folds=3): samples_per_fold[ind_min] += w label_to_fold[ind[label_index]] = ind_min - folds = label_to_fold[y] + folds = label_to_fold[labels] return folds @@ -388,7 +388,7 @@ class DisjointLabelKFold(_BaseKFold): Parameters ---------- - y : array-like with shape (n_samples, ) + labels : array-like with shape (n_samples, ) Contains a label for each sample. The folds are built so that the same label doesn't appear in two different folds. @@ -422,12 +422,12 @@ class DisjointLabelKFold(_BaseKFold): [3 4]] [3 4] [1 2] """ - def __init__(self, y, n_folds=3): + def __init__(self, labels, n_folds=3): # No shuffling implemented yet - super(DisjointLabelKFold, self).__init__(len(y), n_folds, False, None) + super(DisjointLabelKFold, self).__init__(len(labels), n_folds, False, None) self.n_folds = n_folds - self.n = len(y) - self.idxs = disjoint_label_folds(y=y, n_folds=n_folds) + self.n = len(labels) + self.idxs = disjoint_label_folds(labels=labels, n_folds=n_folds) def _iter_test_indices(self): for i in range(self.n_folds): From 3faa60a248d3b099c18d73b1a7e622dfa5293f47 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Mon, 29 Jun 2015 15:36:12 +0100 Subject: [PATCH 20/25] Added test for degenerate case where n_folds > n_labels. 
--- sklearn/cross_validation.py | 5 +++++ sklearn/tests/test_cross_validation.py | 3 +++ 2 files changed, 8 insertions(+) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index c3639f4e99969..ce5731f3e1f06 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -356,6 +356,11 @@ def disjoint_label_folds(labels, n_folds=3): """ labels = np.array(labels) unique_labels, labels = np.unique(labels, return_inverse=True) + n_labels = len(unique_labels) + if n_folds > n_labels: + raise ValueError( + ("Cannot have number of folds n_folds={0} greater" + " than the number of labels: {1}.").format(n_folds, n_labels)) # number of occurrence of each label (its "weight") samples_per_label = np.bincount(labels) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 25b617c93d3ae..c01acef37e5a8 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -1093,3 +1093,6 @@ def test_disjoint_label_folds(): for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) + # Should fail if there are more folds than labels + labels = np.array([1, 1, 1, 2, 2]) + assert_raises(ValueError, cval.disjoint_label_folds, labels, n_folds=3) From 90275bee75aa2e2fe5347ae805864d4abd96b58f Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Mon, 29 Jun 2015 15:37:54 +0100 Subject: [PATCH 21/25] Documented the requirement n_labels > n_folds. --- sklearn/cross_validation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index ce5731f3e1f06..c456934592a71 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -353,6 +353,7 @@ def disjoint_label_folds(labels, n_folds=3): Notes ----- The folds are built by distributing the labels by frequency of appearance. + The number of labels has to be at least equal to the number of folds. 
""" labels = np.array(labels) unique_labels, labels = np.unique(labels, return_inverse=True) @@ -389,7 +390,8 @@ def disjoint_label_folds(labels, n_folds=3): class DisjointLabelKFold(_BaseKFold): """Creates K approximately equilibrated folds. - The same label will not appear in two different folds. + The same label will not appear in two different folds + (the number of labels has to be at least equal to the number of folds). Parameters ---------- From 0b2dff6b23118ef489d080d4d2bf3b19b27c1385 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Wed, 1 Jul 2015 20:09:38 +0100 Subject: [PATCH 22/25] DOC: improved description + added see also sections. --- sklearn/cross_validation.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index c456934592a71..44c636ab55ebf 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -297,6 +297,8 @@ class KFold(_BaseKFold): StratifiedKFold: take label information into account to avoid building folds with imbalanced class distributions (for binary or multiclass classification tasks). + + DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, n, n_folds=3, shuffle=False, @@ -388,10 +390,13 @@ def disjoint_label_folds(labels, n_folds=3): class DisjointLabelKFold(_BaseKFold): - """Creates K approximately equilibrated folds. - - The same label will not appear in two different folds - (the number of labels has to be at least equal to the number of folds). + """K-fold iterator variant with non-overlapping labels. + + The same label will not appear in two different folds (the number of + labels has to be at least equal to the number of folds). + + The folds are approximately balanced in the sense that the number of + distinct labels is approximately the same in each fold. 
Parameters ---------- @@ -427,8 +432,12 @@ class DisjointLabelKFold(_BaseKFold): [[5 6] [7 8]] [[1 2] [3 4]] [3 4] [1 2] - """ + See also + -------- + LeaveOneLabelOut for splitting the data according to explicit, + domain-specific stratification of the dataset. + """ def __init__(self, labels, n_folds=3): # No shuffling implemented yet super(DisjointLabelKFold, self).__init__(len(labels), n_folds, False, None) @@ -500,6 +509,9 @@ class StratifiedKFold(_BaseKFold): All the folds have size trunc(n_samples / n_folds), the last one has the complementary. + See also + -------- + DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, y, n_folds=3, shuffle=False, @@ -606,6 +618,9 @@ class LeaveOneLabelOut(_PartitionIterator): [3 4]] [[5 6] [7 8]] [1 2] [1 2] + See also + -------- + DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, labels): @@ -679,6 +694,10 @@ class LeavePLabelOut(_PartitionIterator): TRAIN: [0] TEST: [1 2] [[1 2]] [[3 4] [5 6]] [1] [2 1] + + See also + -------- + DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, labels, p): From 6b4b63fde6059afbae855d2893117347d39d02d1 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Wed, 1 Jul 2015 20:11:44 +0100 Subject: [PATCH 23/25] Fixed dtype of temporary arrays. 
--- sklearn/cross_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 44c636ab55ebf..ed7458be80bb1 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -372,10 +372,10 @@ def disjoint_label_folds(labels, n_folds=3): samples_per_label = samples_per_label[ind] # Total weight of each fold - samples_per_fold = np.zeros(n_folds) + samples_per_fold = np.zeros(n_folds, dtype=np.uint64) # Mapping from label index to fold index - label_to_fold = np.zeros(len(unique_labels)) + label_to_fold = np.zeros(len(unique_labels), dtype=np.uintp) # While there are weights, distribute them # Specifically, add the biggest weight to the lightest fold From c9a636c4d5ab44937305352de907aea5952502c1 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Wed, 1 Jul 2015 20:15:31 +0100 Subject: [PATCH 24/25] Improved test: check that one label is not in both test and training. --- sklearn/tests/test_cross_validation.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index c01acef37e5a8..b7e4e15ef76eb 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -1055,7 +1055,7 @@ def test_disjoint_label_folds(): # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed - labels = np.random.randint(0, n_labels, n_samples) + labels = rng.randint(0, n_labels, n_samples) folds = cval.disjoint_label_folds(labels, n_folds) ideal_n_labels_per_fold = n_samples // n_folds @@ -1064,9 +1064,14 @@ def test_disjoint_label_folds(): for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) - # Check that each subjects appears only in 1 fold + # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 
1) + + # Check that no label is on both sides of the split + labels = np.asarray(labels, dtype=object) # to allow fancy indexing on labels + for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds): + assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Construct the test data labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', @@ -1089,9 +1094,14 @@ def test_disjoint_label_folds(): for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) - # Check that each subjects appears only in 1 fold + # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) + + # Check that no label is on both sides of the split + labels = np.asarray(labels, dtype=object) # to allow fancy indexing on labels + for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds): + assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Should fail if there are more folds than labels labels = np.array([1, 1, 1, 2, 2]) From ecb0ea0c30315696c891269688635476e6cd9564 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Wed, 1 Jul 2015 20:27:52 +0100 Subject: [PATCH 25/25] Added documentation for DisjoinLabelKFold. 
--- doc/modules/classes.rst | 1 + doc/modules/cross_validation.rst | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 4309c8a21dfc8..6147086d2bb4e 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -168,6 +168,7 @@ Classes cross_validation.LeavePOut cross_validation.PredefinedSplit cross_validation.StratifiedKFold + cross_validation.DisjointLabelKFold cross_validation.ShuffleSplit cross_validation.StratifiedShuffleSplit diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 0aa6bf1e3b692..5d6dd2d041a6f 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -261,6 +261,33 @@ two slightly unbalanced classes:: [0 1 2 4 5 6 7] [3 8 9] +Disjoint label KFold +-------------------- + +:class:`DisjointLabelKFold` is a variation of *k-fold* which ensures that the same +label is not in both testing and training sets. +This is necessary, for example, if you obtained data from different subjects and you +want to avoid over-fitting (i.e. learning person-specific features) by testing and +training on different subjects. + +Imagine you have three subjects, each with an associated number from 1 to 3:: + + >>> from sklearn.cross_validation import DisjointLabelKFold + + >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] + + >>> dlkf = DisjointLabelKFold(labels, 3) + >>> for train, test in dlkf: + ... print("%s %s" % (train, test)) + [0 1 2 3 4 5] [6 7 8 9] + [0 1 2 6 7 8 9] [3 4 5] + [3 4 5 6 7 8 9] [0 1 2] + +Each subject is in a different testing fold, and the same subject is never in both +testing and training. +Notice that the folds do not have exactly the same size due to the imbalance in the data. + + Leave-One-Out - LOO -------------------