From 6eb40285192f5530e6ef3e37aa70b0571164e1d8 Mon Sep 17 00:00:00 2001 From: Jean Kossaifi Date: Tue, 24 Mar 2015 15:32:29 +0000 Subject: [PATCH 1/4] Added subject independent KFold Changed SubjectIndependentKFold to DisjointGroupKFold cosmetic changes test (fix seed correctly, use assert_equal for meaningful error messages) Changed name to DisjointLabelKFold Added example of use FIX: whitespace related doctest failure FIX: Python 2.6 requires the field numbers in print FIX: change docstring to comment in test function DOC: moved docstring from function to class FIX: added call to parent class FIX: error in calling the parent DOC: fixed doctest FIX: doctest Cosmetic changes (minor refactoring) Optimised code (use np.bincount) Cosmetic: use samples instead of weight for clarity Minor fix: removed shuffle parameter Cosmetic Use mergesort instead of quicksort for reproducibility. Changed variable name 'y' to 'label'. Added test for degenerate case where n_folds > n_labels. Documented the requirement n_labels >= n_folds. DOC: improved description + added see also sections. Fixed dtype of temporary arrays. Improved test: check that one label is not in both test and training. Added documentation for DisjointLabelKFold. 
--- doc/modules/classes.rst | 1 + doc/modules/cross_validation.rst | 27 +++++ sklearn/cross_validation.py | 139 +++++++++++++++++++++++++ sklearn/tests/test_cross_validation.py | 68 ++++++++++++ 4 files changed, 235 insertions(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index d8b3dd4dbe1d6..d50a7375a1889 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -168,6 +168,7 @@ Classes cross_validation.LeavePOut cross_validation.PredefinedSplit cross_validation.StratifiedKFold + cross_validation.DisjointLabelKFold cross_validation.ShuffleSplit cross_validation.LabelShuffleSplit cross_validation.StratifiedShuffleSplit diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 53afdf53550b1..ad1e9a8ddcf9c 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -261,6 +261,33 @@ two slightly unbalanced classes:: [0 1 2 4 5 6 7] [3 8 9] +Disjoint label KFold +-------------------- + +:class:`DisjointLabelKFold` is a variation of *k-fold* which ensures that the same +label is not in both testing and training sets. +This is necessary for example if you obtained data from different subjects and you +want to avoid over-fitting (ie learning person specific features) by testing and +training on different subjects. + +Imagine you have three subjects, each with an associated number from 1 to 3:: + + >>> from sklearn.cross_validation import DisjointLabelKFold + + >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] + + >>> dlkf = DisjointLabelKFold(labels, 3) + >>> for train, test in dlkf: + ... print("%s %s" % (train, test)) + [0 1 2 3 4 5] [6 7 8 9] + [0 1 2 6 7 8 9] [3 4 5] + [3 4 5 6 7 8 9] [0 1 2] + +Each subject is in a different testing fold, and the same subject is never in both +testing and training. +Notice that the folds do not have exactly the same size due to the imbalance in the data. 
+ + Leave-One-Out - LOO ------------------- diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 4c49bee86517d..c9b88832b102a 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -304,6 +304,8 @@ class KFold(_BaseKFold): StratifiedKFold: take label information into account to avoid building folds with imbalanced class distributions (for binary or multiclass classification tasks). + + DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, n, n_folds=3, shuffle=False, @@ -339,6 +341,133 @@ def __len__(self): return self.n_folds +def disjoint_label_folds(labels, n_folds=3): + """Creates folds where a same label is not in two different folds. + + Parameters + ---------- + labels: numpy array, shape (n_samples,) + Contains an id for each sample. + The folds are built so that the same id doesn't appear in two different folds. + + n_folds: int, default=3 + Number of folds to split the data into. + + Returns + ------- + folds: numpy array of shape (n_samples, ) + Array of integers between 0 and (n_folds - 1). + Folds[i] contains the folds to which sample i is assigned. + + Notes + ----- + The folds are built by distributing the labels by frequency of appearance. + The number of labels has to be at least equal to the number of folds. 
+ """ + labels = np.array(labels) + unique_labels, labels = np.unique(labels, return_inverse=True) + n_labels = len(unique_labels) + if n_folds > n_labels: + raise ValueError( + ("Cannot have number of folds n_folds={0} greater" + " than the number of labels: {1}.").format(n_folds, n_labels)) + + # number of occurrence of each label (its "weight") + samples_per_label = np.bincount(labels) + # We want to distribute the most frequent labels first + ind = np.argsort(samples_per_label, kind='mergesort')[::-1] + samples_per_label = samples_per_label[ind] + + # Total weight of each fold + samples_per_fold = np.zeros(n_folds, dtype=np.uint64) + + # Mapping from label index to fold index + label_to_fold = np.zeros(len(unique_labels), dtype=np.uintp) + + # While there are weights, distribute them + # Specifically, add the biggest weight to the lightest fold + for label_index, w in enumerate(samples_per_label): + ind_min = np.argmin(samples_per_fold) + samples_per_fold[ind_min] += w + label_to_fold[ind[label_index]] = ind_min + + folds = label_to_fold[labels] + + return folds + + +class DisjointLabelKFold(_BaseKFold): + """K-fold iterator variant with non-overlapping labels. + + The same label will not appear in two different folds (the number of + labels has to be at least equal to the number of folds). + + The folds are approximately balanced in the sense so that the number of + distinct labels is approximately the same in each fold. + + Parameters + ---------- + labels : array-like with shape (n_samples, ) + Contains a label for each sample. + The folds are built so that the same label doesn't appear in two different folds. + + n_folds : int, default is 3 + Number of folds. 
+ + Examples + -------- + >>> from sklearn import cross_validation + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 3, 4]) + >>> labels = np.array([0, 0, 2, 2]) + >>> dl_kfold = cross_validation.DisjointLabelKFold(labels, n_folds=2) + >>> len(dl_kfold) + 2 + >>> print(dl_kfold) + sklearn.cross_validation.DisjointLabelKFold(n_labels=4, n_folds=2) + >>> for train_index, test_index in dl_kfold: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) + ... + TRAIN: [0 1] TEST: [2 3] + [[1 2] + [3 4]] [[5 6] + [7 8]] [1 2] [3 4] + TRAIN: [2 3] TEST: [0 1] + [[5 6] + [7 8]] [[1 2] + [3 4]] [3 4] [1 2] + + See also + -------- + LeaveOneLabelOut for splitting the data according to explicit, + domain-specific stratification of the dataset. + """ + def __init__(self, labels, n_folds=3): + # No shuffling implemented yet + super(DisjointLabelKFold, self).__init__(len(labels), n_folds, False, None) + self.n_folds = n_folds + self.n = len(labels) + self.idxs = disjoint_label_folds(labels=labels, n_folds=n_folds) + + def _iter_test_indices(self): + for i in range(self.n_folds): + yield (self.idxs == i) + + def __repr__(self): + return '{0}.{1}(n_labels={2}, n_folds={3})'.format( + self.__class__.__module__, + self.__class__.__name__, + self.n, + self.n_folds, + ) + + def __len__(self): + return self.n_folds + + class StratifiedKFold(_BaseKFold): """Stratified K-Folds cross validation iterator @@ -389,6 +518,9 @@ class StratifiedKFold(_BaseKFold): All the folds have size trunc(n_samples / n_folds), the last one has the complementary. + See also + -------- + DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. 
""" def __init__(self, y, n_folds=3, shuffle=False, @@ -497,6 +629,9 @@ class LeaveOneLabelOut(_PartitionIterator): [3 4]] [[5 6] [7 8]] [1 2] [1 2] + See also + -------- + DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, labels): @@ -572,6 +707,10 @@ class LeavePLabelOut(_PartitionIterator): TRAIN: [0] TEST: [1 2] [[1 2]] [[3 4] [5 6]] [1] [2 1] + + See also + -------- + DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, labels, p): diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 3c45261103411..1bdd195621615 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -13,6 +13,7 @@ from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_greater +from sklearn.utils.testing import assert_greater_equal from sklearn.utils.testing import assert_less from sklearn.utils.testing import assert_not_equal from sklearn.utils.testing import assert_array_almost_equal @@ -1153,3 +1154,70 @@ def test_cross_val_predict_sparse_prediction(): preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10) preds_sparse = preds_sparse.toarray() assert_array_almost_equal(preds_sparse, preds) + + +def test_disjoint_label_folds(): + ## Check that the function produces equilibrated folds + ## with no label appearing in two different folds + + # Fix the seed for reproducibility + rng = np.random.RandomState(0) + + # Parameters of the test + n_labels = 15 + n_samples = 1000 + n_folds = 5 + + # Construct the test data + tolerance = 0.05 * n_samples # 5 percent error allowed + labels = rng.randint(0, n_labels, n_samples) + folds = cval.disjoint_label_folds(labels, n_folds) + ideal_n_labels_per_fold = n_samples // n_folds + + # Check that folds have approximately the same size + assert_equal(len(folds), len(labels)) + 
for i in np.unique(folds): + assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) + + # Check that each label appears only in 1 fold + for label in np.unique(labels): + assert_equal(len(np.unique(folds[labels == label])), 1) + + # Check that no label is on both sides of the split + labels = np.asarray(labels, dtype=object) # to allow fancy indexing on labels + for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds): + assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) + + # Construct the test data + labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', + 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', + 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', + 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', + 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', + 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', + 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'] + + n_labels = len(np.unique(labels)) + n_samples = len(labels) + n_folds = 5 + tolerance = 0.05 * n_samples # 5 percent error allowed + folds = cval.disjoint_label_folds(labels, n_folds) + ideal_n_labels_per_fold = n_samples // n_folds + + # Check that folds have approximately the same size + assert_equal(len(folds), len(labels)) + for i in np.unique(folds): + assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) + + # Check that each label appears only in 1 fold + for label in np.unique(labels): + assert_equal(len(np.unique(folds[labels == label])), 1) + + # Check that no label is on both sides of the split + labels = np.asarray(labels, dtype=object) # to allow fancy indexing on labels + for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds): + assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) + + # Should fail if there are more folds than labels + labels = np.array([1, 1, 1, 2, 2]) + assert_raises(ValueError, cval.disjoint_label_folds, labels, n_folds=3) From 
cf7bc7b035efaefea7dc523e814a65cc3d0c5514 Mon Sep 17 00:00:00 2001 From: Gilles Louppe Date: Sun, 30 Aug 2015 18:56:57 +0200 Subject: [PATCH 2/4] ENH: rename to LabelKFold COSMIT: doc, pep8, etc Refactor code --- doc/modules/classes.rst | 7 +- doc/modules/cross_validation.rst | 32 ++--- sklearn/cross_validation.py | 169 ++++++++++++------------- sklearn/tests/test_cross_validation.py | 132 ++++++++++--------- 4 files changed, 164 insertions(+), 176 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index d50a7375a1889..6001c3d9f92a0 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -162,17 +162,18 @@ Classes :template: class.rst cross_validation.KFold + cross_validation.LabelKFold + cross_validation.LabelShuffleSplit cross_validation.LeaveOneLabelOut cross_validation.LeaveOneOut cross_validation.LeavePLabelOut cross_validation.LeavePOut cross_validation.PredefinedSplit - cross_validation.StratifiedKFold - cross_validation.DisjointLabelKFold cross_validation.ShuffleSplit - cross_validation.LabelShuffleSplit + cross_validation.StratifiedKFold cross_validation.StratifiedShuffleSplit + .. autosummary:: :toctree: generated/ :template: function.rst diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index ad1e9a8ddcf9c..93c30eb342f19 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -261,31 +261,31 @@ two slightly unbalanced classes:: [0 1 2 4 5 6 7] [3 8 9] -Disjoint label KFold --------------------- +Label k-fold +------------ -:class:`DisjointLabelKFold` is a variation of *k-fold* which ensures that the same -label is not in both testing and training sets. -This is necessary for example if you obtained data from different subjects and you -want to avoid over-fitting (ie learning person specific features) by testing and -training on different subjects. 
+:class:`LabelKFold` is a variation of *k-fold* which ensures that the same +label is not in both testing and training sets. This is necessary for example +if you obtained data from different subjects and you want to avoid over-fitting +(i.e., learning person specific features) by testing and training on different +subjects. Imagine you have three subjects, each with an associated number from 1 to 3:: - >>> from sklearn.cross_validation import DisjointLabelKFold + >>> from sklearn.cross_validation import LabelKFold >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] - >>> dlkf = DisjointLabelKFold(labels, 3) - >>> for train, test in dlkf: + >>> lkf = LabelKFold(labels, 3) + >>> for train, test in lkf: ... print("%s %s" % (train, test)) [0 1 2 3 4 5] [6 7 8 9] [0 1 2 6 7 8 9] [3 4 5] [3 4 5 6 7 8 9] [0 1 2] -Each subject is in a different testing fold, and the same subject is never in both -testing and training. -Notice that the folds do not have exactly the same size due to the imbalance in the data. +Each subject is in a different testing fold, and the same subject is never in +both testing and training. Notice that the folds do not have exactly the same +size due to the imbalance in the data. Leave-One-Out - LOO @@ -462,15 +462,15 @@ Label-Shuffle-Split :class:`LabelShuffleSplit` -The :class:`LabelShuffleSplit` iterator behaves as a combination of -:class:`ShuffleSplit` and :class:`LeavePLabelsOut`, and generates a +The :class:`LabelShuffleSplit` iterator behaves as a combination of +:class:`ShuffleSplit` and :class:`LeavePLabelsOut`, and generates a sequence of randomized partitions in which a subset of labels are held out for each split. Here is a usage example:: >>> from sklearn.cross_validation import LabelShuffleSplit - + >>> labels = [1, 1, 2, 2, 3, 3, 4, 4] >>> slo = LabelShuffleSplit(labels, n_iter=4, test_size=0.5, ... 
random_state=0) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index c9b88832b102a..a9808a8de65bf 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -33,6 +33,7 @@ from .utils.fixes import bincount __all__ = ['KFold', + 'LabelKFold', 'LeaveOneLabelOut', 'LeaveOneOut', 'LeavePLabelOut', @@ -273,7 +274,7 @@ class KFold(_BaseKFold): Whether to shuffle the data before splitting into batches. random_state : None, int or RandomState - When shuffle=True, pseudo-random number generator state used for + When shuffle=True, pseudo-random number generator state used for shuffling. If None, use default numpy RNG for shuffling. Examples @@ -305,7 +306,7 @@ class KFold(_BaseKFold): folds with imbalanced class distributions (for binary or multiclass classification tasks). - DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. + LabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, n, n_folds=3, shuffle=False, @@ -341,78 +342,31 @@ def __len__(self): return self.n_folds -def disjoint_label_folds(labels, n_folds=3): - """Creates folds where a same label is not in two different folds. - - Parameters - ---------- - labels: numpy array, shape (n_samples,) - Contains an id for each sample. - The folds are built so that the same id doesn't appear in two different folds. - - n_folds: int, default=3 - Number of folds to split the data into. - - Returns - ------- - folds: numpy array of shape (n_samples, ) - Array of integers between 0 and (n_folds - 1). - Folds[i] contains the folds to which sample i is assigned. - - Notes - ----- - The folds are built by distributing the labels by frequency of appearance. - The number of labels has to be at least equal to the number of folds. 
- """ - labels = np.array(labels) - unique_labels, labels = np.unique(labels, return_inverse=True) - n_labels = len(unique_labels) - if n_folds > n_labels: - raise ValueError( - ("Cannot have number of folds n_folds={0} greater" - " than the number of labels: {1}.").format(n_folds, n_labels)) - - # number of occurrence of each label (its "weight") - samples_per_label = np.bincount(labels) - # We want to distribute the most frequent labels first - ind = np.argsort(samples_per_label, kind='mergesort')[::-1] - samples_per_label = samples_per_label[ind] - - # Total weight of each fold - samples_per_fold = np.zeros(n_folds, dtype=np.uint64) - - # Mapping from label index to fold index - label_to_fold = np.zeros(len(unique_labels), dtype=np.uintp) - - # While there are weights, distribute them - # Specifically, add the biggest weight to the lightest fold - for label_index, w in enumerate(samples_per_label): - ind_min = np.argmin(samples_per_fold) - samples_per_fold[ind_min] += w - label_to_fold[ind[label_index]] = ind_min - - folds = label_to_fold[labels] - - return folds - - -class DisjointLabelKFold(_BaseKFold): +class LabelKFold(_BaseKFold): """K-fold iterator variant with non-overlapping labels. The same label will not appear in two different folds (the number of - labels has to be at least equal to the number of folds). + distinct labels has to be at least equal to the number of folds). - The folds are approximately balanced in the sense so that the number of + The folds are approximately balanced in the sense that the number of distinct labels is approximately the same in each fold. Parameters ---------- labels : array-like with shape (n_samples, ) Contains a label for each sample. - The folds are built so that the same label doesn't appear in two different folds. + The folds are built so that the same label does not appear in two + different folds. - n_folds : int, default is 3 - Number of folds. + n_folds : int, default=3 + Number of folds. Must be at least 2. 
+ + shuffle : boolean, optional + Whether to shuffle the data before splitting into batches. + + random_state : None, int or RandomState + When shuffle=True, pseudo-random number generator state used for + shuffling. If None, use default numpy RNG for shuffling. Examples -------- @@ -420,17 +374,17 @@ class DisjointLabelKFold(_BaseKFold): >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) >>> y = np.array([1, 2, 3, 4]) >>> labels = np.array([0, 0, 2, 2]) - >>> dl_kfold = cross_validation.DisjointLabelKFold(labels, n_folds=2) - >>> len(dl_kfold) + >>> label_kfold = cross_validation.LabelKFold(labels, n_folds=2) + >>> len(label_kfold) 2 - >>> print(dl_kfold) - sklearn.cross_validation.DisjointLabelKFold(n_labels=4, n_folds=2) - >>> for train_index, test_index in dl_kfold: + >>> print(label_kfold) + sklearn.cross_validation.LabelKFold(n_labels=4, n_folds=2) + >>> for train_index, test_index in label_kfold: ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] ... y_train, y_test = y[train_index], y[test_index] ... print(X_train, X_test, y_train, y_test) - ... + ... TRAIN: [0 1] TEST: [2 3] [[1 2] [3 4]] [[5 6] @@ -445,12 +399,47 @@ class DisjointLabelKFold(_BaseKFold): LeaveOneLabelOut for splitting the data according to explicit, domain-specific stratification of the dataset. 
""" - def __init__(self, labels, n_folds=3): - # No shuffling implemented yet - super(DisjointLabelKFold, self).__init__(len(labels), n_folds, False, None) - self.n_folds = n_folds + def __init__(self, labels, n_folds=3, shuffle=False, random_state=None): + super(LabelKFold, self).__init__(len(labels), n_folds, shuffle, + random_state) + self.n = len(labels) - self.idxs = disjoint_label_folds(labels=labels, n_folds=n_folds) + self.n_folds = n_folds + + unique_labels, labels = np.unique(labels, return_inverse=True) + n_labels = len(unique_labels) + + if n_folds > n_labels: + raise ValueError( + ("Cannot have number of folds n_folds={0} greater" + " than the number of labels: {1}.").format(n_folds, + n_labels)) + + # Weight labels by their number of occurences + samples_per_label = np.bincount(labels) + + # Distribute the most frequent labels first + ind = np.argsort(samples_per_label)[::-1] + samples_per_label = samples_per_label[ind] + + # Total weight of each fold + samples_per_fold = np.zeros(n_folds, dtype=np.uint64) + + # Mapping from label index to fold index + label_to_fold = np.zeros(len(unique_labels), dtype=np.uintp) + + # While there are weights, distribute them + # Specifically, add the biggest weight to the lightest fold + for label_index, w in enumerate(samples_per_label): + ind_min = np.argmin(samples_per_fold) + samples_per_fold[ind_min] += w + label_to_fold[ind[label_index]] = ind_min + + self.idxs = label_to_fold[labels] + + if shuffle: + rng = check_random_state(self.random_state) + rng.shuffle(self.idxs) def _iter_test_indices(self): for i in range(self.n_folds): @@ -492,7 +481,7 @@ class StratifiedKFold(_BaseKFold): into batches. random_state : None, int or RandomState - When shuffle=True, pseudo-random number generator state used for + When shuffle=True, pseudo-random number generator state used for shuffling. If None, use default numpy RNG for shuffling. 
Examples @@ -520,7 +509,7 @@ class StratifiedKFold(_BaseKFold): See also -------- - DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. + LabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, y, n_folds=3, shuffle=False, @@ -631,7 +620,7 @@ class LeaveOneLabelOut(_PartitionIterator): See also -------- - DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. + LabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, labels): @@ -710,7 +699,7 @@ class LeavePLabelOut(_PartitionIterator): See also -------- - DisjointLabelKFold: K-fold iterator variant with non-overlapping labels. + LabelKFold: K-fold iterator variant with non-overlapping labels. """ def __init__(self, labels, p): @@ -1218,11 +1207,11 @@ def cross_val_predict(estimator, X, y=None, cv=None, n_jobs=1, supervised learning. cv : integer or cross-validation generator, optional, default=3 - A cross-validation generator to use. If int, determines the number - of folds in StratifiedKFold if estimator is a classifier and the - target y is binary or multiclass, or the number of folds in KFold + A cross-validation generator to use. If int, determines the number + of folds in StratifiedKFold if estimator is a classifier and the + target y is binary or multiclass, or the number of folds in KFold otherwise. - Specific cross-validation objects can be passed, see + Specific cross-validation objects can be passed, see sklearn.cross_validation module for the list of possible objects. This generator must include all elements in the test set exactly once. Otherwise, a ValueError is raised. @@ -1387,11 +1376,11 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, ``scorer(estimator, X, y)``. cv : integer or cross-validation generator, optional, default=3 - A cross-validation generator to use. 
If int, determines the number - of folds in StratifiedKFold if estimator is a classifier and the - target y is binary or multiclass, or the number of folds in KFold + A cross-validation generator to use. If int, determines the number + of folds in StratifiedKFold if estimator is a classifier and the + target y is binary or multiclass, or the number of folds in KFold otherwise. - Specific cross-validation objects can be passed, see + Specific cross-validation objects can be passed, see sklearn.cross_validation module for the list of possible objects. n_jobs : integer, optional @@ -1708,11 +1697,11 @@ def permutation_test_score(estimator, X, y, cv=None, ``scorer(estimator, X, y)``. cv : integer or cross-validation generator, optional, default=3 - A cross-validation generator to use. If int, determines the number - of folds in StratifiedKFold if estimator is a classifier and the - target y is binary or multiclass, or the number of folds in KFold + A cross-validation generator to use. If int, determines the number + of folds in StratifiedKFold if estimator is a classifier and the + target y is binary or multiclass, or the number of folds in KFold otherwise. - Specific cross-validation objects can be passed, see + Specific cross-validation objects can be passed, see sklearn.cross_validation module for the list of possible objects. 
n_permutations : integer, optional diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 1bdd195621615..df2d90877bc50 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -359,6 +359,71 @@ def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 assert_greater(mean_score, 0.85) +def test_label_kfold(): + rng = np.random.RandomState(0) + + # Parameters of the test + n_labels = 15 + n_samples = 1000 + n_folds = 5 + + # Construct the test data + tolerance = 0.05 * n_samples # 5 percent error allowed + labels = rng.randint(0, n_labels, n_samples) + folds = cval.LabelKFold(labels, n_folds).idxs + ideal_n_labels_per_fold = n_samples // n_folds + + # Check that folds have approximately the same size + assert_equal(len(folds), len(labels)) + for i in np.unique(folds): + assert_greater_equal(tolerance, + abs(sum(folds == i) - ideal_n_labels_per_fold)) + + # Check that each label appears only in 1 fold + for label in np.unique(labels): + assert_equal(len(np.unique(folds[labels == label])), 1) + + # Check that no label is on both sides of the split + labels = np.asarray(labels, dtype=object) + for train, test in cval.LabelKFold(labels, n_folds=n_folds): + assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) + + # Construct the test data + labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', + 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', + 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', + 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', + 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', + 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', + 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'] + + n_labels = len(np.unique(labels)) + n_samples = len(labels) + n_folds = 5 + tolerance = 0.05 * n_samples # 5 percent error allowed + folds = cval.LabelKFold(labels, n_folds).idxs + ideal_n_labels_per_fold = n_samples // n_folds + + # Check that folds 
have approximately the same size + assert_equal(len(folds), len(labels)) + for i in np.unique(folds): + assert_greater_equal(tolerance, + abs(sum(folds == i) - ideal_n_labels_per_fold)) + + # Check that each label appears only in 1 fold + for label in np.unique(labels): + assert_equal(len(np.unique(folds[labels == label])), 1) + + # Check that no label is on both sides of the split + labels = np.asarray(labels, dtype=object) + for train, test in cval.LabelKFold(labels, n_folds=n_folds): + assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) + + # Should fail if there are more folds than labels + labels = np.array([1, 1, 1, 2, 2]) + assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3) + + def test_shuffle_split(): ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0) ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0) @@ -1154,70 +1219,3 @@ def test_cross_val_predict_sparse_prediction(): preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10) preds_sparse = preds_sparse.toarray() assert_array_almost_equal(preds_sparse, preds) - - -def test_disjoint_label_folds(): - ## Check that the function produces equilibrated folds - ## with no label appearing in two different folds - - # Fix the seed for reproducibility - rng = np.random.RandomState(0) - - # Parameters of the test - n_labels = 15 - n_samples = 1000 - n_folds = 5 - - # Construct the test data - tolerance = 0.05 * n_samples # 5 percent error allowed - labels = rng.randint(0, n_labels, n_samples) - folds = cval.disjoint_label_folds(labels, n_folds) - ideal_n_labels_per_fold = n_samples // n_folds - - # Check that folds have approximately the same size - assert_equal(len(folds), len(labels)) - for i in np.unique(folds): - assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) - - # Check that each label appears only in 1 fold - for label in np.unique(labels): - assert_equal(len(np.unique(folds[labels == label])), 1) - - # Check that no label 
is on both sides of the split - labels = np.asarray(labels, dtype=object) # to allow fancy indexing on labels - for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds): - assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) - - # Construct the test data - labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', - 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', - 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', - 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', - 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', - 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', - 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'] - - n_labels = len(np.unique(labels)) - n_samples = len(labels) - n_folds = 5 - tolerance = 0.05 * n_samples # 5 percent error allowed - folds = cval.disjoint_label_folds(labels, n_folds) - ideal_n_labels_per_fold = n_samples // n_folds - - # Check that folds have approximately the same size - assert_equal(len(folds), len(labels)) - for i in np.unique(folds): - assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) - - # Check that each label appears only in 1 fold - for label in np.unique(labels): - assert_equal(len(np.unique(folds[labels == label])), 1) - - # Check that no label is on both sides of the split - labels = np.asarray(labels, dtype=object) # to allow fancy indexing on labels - for train, test in cval.DisjointLabelKFold(labels, n_folds=n_folds): - assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) - - # Should fail if there are more folds than labels - labels = np.array([1, 1, 1, 2, 2]) - assert_raises(ValueError, cval.disjoint_label_folds, labels, n_folds=3) From c769a65e156b7281c5423ef359782a7be60a62af Mon Sep 17 00:00:00 2001 From: Gilles Louppe Date: Mon, 7 Sep 2015 10:08:17 +0200 Subject: [PATCH 3/4] COSMIT: variable names, documentation, etc --- doc/modules/cross_validation.rst | 2 +- sklearn/cross_validation.py | 33 ++++++++++++++++---------------- 2 files 
changed, 17 insertions(+), 18 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 93c30eb342f19..e76b95c6e48be 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -276,7 +276,7 @@ Imagine you have three subjects, each with an associated number from 1 to 3:: >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] - >>> lkf = LabelKFold(labels, 3) + >>> lkf = LabelKFold(labels, n_folds=3) >>> for train, test in lkf: ... print("%s %s" % (train, test)) [0 1 2 3 4 5] [6 7 8 9] diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index a9808a8de65bf..2e988d5985cf8 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -279,10 +279,10 @@ class KFold(_BaseKFold): Examples -------- - >>> from sklearn import cross_validation + >>> from sklearn.cross_validation import KFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([1, 2, 3, 4]) - >>> kf = cross_validation.KFold(4, n_folds=2) + >>> kf = KFold(4, n_folds=2) >>> len(kf) 2 >>> print(kf) # doctest: +NORMALIZE_WHITESPACE @@ -370,11 +370,11 @@ class LabelKFold(_BaseKFold): Examples -------- - >>> from sklearn import cross_validation + >>> from sklearn.cross_validation import LabelKFold >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) >>> y = np.array([1, 2, 3, 4]) >>> labels = np.array([0, 0, 2, 2]) - >>> label_kfold = cross_validation.LabelKFold(labels, n_folds=2) + >>> label_kfold = LabelKFold(labels, n_folds=2) >>> len(label_kfold) 2 >>> print(label_kfold) @@ -416,24 +416,23 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None): n_labels)) # Weight labels by their number of occurences - samples_per_label = np.bincount(labels) + n_samples_per_label = np.bincount(labels) # Distribute the most frequent labels first - ind = np.argsort(samples_per_label)[::-1] - samples_per_label = samples_per_label[ind] + indices = np.argsort(n_samples_per_label)[::-1] + n_samples_per_label 
= n_samples_per_label[indices] # Total weight of each fold - samples_per_fold = np.zeros(n_folds, dtype=np.uint64) + n_samples_per_fold = np.zeros(n_folds) # Mapping from label index to fold index - label_to_fold = np.zeros(len(unique_labels), dtype=np.uintp) + label_to_fold = np.zeros(len(unique_labels)) - # While there are weights, distribute them - # Specifically, add the biggest weight to the lightest fold - for label_index, w in enumerate(samples_per_label): - ind_min = np.argmin(samples_per_fold) - samples_per_fold[ind_min] += w - label_to_fold[ind[label_index]] = ind_min + # Distribute samples by adding the largest weight to the lightest fold + for label_index, weight in enumerate(n_samples_per_label): + lightest_fold = np.argmin(n_samples_per_fold) + n_samples_per_fold[lightest_fold] += weight + label_to_fold[indices[label_index]] = lightest_fold self.idxs = label_to_fold[labels] @@ -486,10 +485,10 @@ class StratifiedKFold(_BaseKFold): Examples -------- - >>> from sklearn import cross_validation + >>> from sklearn.cross_validation import StratifiedKFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([0, 0, 1, 1]) - >>> skf = cross_validation.StratifiedKFold(y, n_folds=2) + >>> skf = StratifiedKFold(y, n_folds=2) >>> len(skf) 2 >>> print(skf) # doctest: +NORMALIZE_WHITESPACE From 410e7f2c2a2a909e47683f9d43d87962628d7b99 Mon Sep 17 00:00:00 2001 From: Gilles Louppe Date: Tue, 8 Sep 2015 07:48:26 +0200 Subject: [PATCH 4/4] ENH: remove unnecessary assignments --- doc/whats_new.rst | 13 ++++++++----- sklearn/cross_validation.py | 3 --- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 7ba8e03378b11..66df0155d76f4 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -34,9 +34,12 @@ New features function into a ``Pipeline``-compatible transformer object. By Joe Jevnik. 
- - :class:`cross_validation.LabelShuffleSplit` generates random train-test - splits, similar to :class:`cross_validation.ShuffleSplit`, except that - the splits are conditioned on a label array. By `Brian McFee`_. + - The new classes :class:`cross_validation.LabelKFold` and + :class:`cross_validation.LabelShuffleSplit` generate train-test folds, + respectively similar to :class:`cross_validation.KFold` and + :class:`cross_validation.ShuffleSplit`, except that the folds are + conditioned on a label array. By `Brian McFee`_, Jean Kossaifi and + `Gilles Louppe`_. Enhancements @@ -127,11 +130,11 @@ Enhancements - Allow :func:`datasets.make_multilabel_classification` to output a sparse ``y``. By Kashif Rasul. - + - :class:`cluster.DBSCAN` now accepts a sparse matrix of precomputed distances, allowing memory-efficient distance precomputation. By `Joel Nothman`_. - + - :class:`tree.DecisionTreeClassifier` now exposes an ``apply`` method for retrieving the leaf indices samples are predicted as. By `Daniel Galvez`_ and `Gilles Louppe`_. diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 2e988d5985cf8..117327dbc814a 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -403,9 +403,6 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None): super(LabelKFold, self).__init__(len(labels), n_folds, shuffle, random_state) - self.n = len(labels) - self.n_folds = n_folds - unique_labels, labels = np.unique(labels, return_inverse=True) n_labels = len(unique_labels)