Renamed SubjectIndependentKfold to DisjointGroupKfold

JeanKossaifi · JeanKossaifi · commit 37ecdd7776f9 · 2015-03-25T11:34:33.000Z
Cosmetic changes to the test (fix seed correctly, use assert_equal for
meaningful error messages)
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
@@ -332,12 +332,12 @@ def __len__(self):
         return self.n_folds
 
 
-def subject_independent_folds(subjects, n_folds=3):
-    """ Creates folds where a same subject is not in two different folds
+def disjoint_group_folds(groups, n_folds=3):
+    """Creates folds where the same group does not appear in two different folds
     
     Parameters
     ----------
-    subjects: iterable of shape (n_samples, )
+    groups: iterable of shape (n_samples, )
         contains an id for each sample
         The folds are built so that the same id doesn't appear in two different folds
     
@@ -352,34 +352,46 @@ def subject_independent_folds(subjects, n_folds=3):
         
     Notes
     -----
-    The folds are built by distributing the subjects by frequency of appearance.
+    Groups are assigned greedily, most frequent first, each to the currently lightest fold.
     """
-    subjects = np.array(subjects)
-    unique_subjects = np.unique(subjects)
+    groups = np.array(groups)
+    unique_groups = np.unique(groups)
     
-    # number of occurrence of each subject (its "weight")
-    weight_per_subject = sorted([(sum(subjects == i), i) for i in unique_subjects])
+    # number of occurrences of each group (its "weight")
+    weight_per_group = sorted([(sum(groups == group_id), group_id) for group_id in unique_groups])
     # Total weight of each fold
     weight_per_fold = np.zeros(n_folds)
     # For each sample, a digit between 0 and (n_folds - 1) to tell which fold it belongs to
-    folds = np.zeros(len(subjects))
+    folds = np.zeros(len(groups))
     
     # While there are weights, distribute them
     # Specifically, add the biggest weight to the lightest fold
-    while weight_per_subject:
+    while weight_per_group:
         ind_min = np.argmin(weight_per_fold)
-        w, actor = weight_per_subject.pop()
+        w, group_id = weight_per_group.pop()
         weight_per_fold[ind_min] += w
-        folds[subjects == actor] = ind_min
+        folds[groups == group_id] = ind_min
     
     return folds
 
 
-class SubjectIndependentKfold(_BaseKFold):
-    def __init__(self, subjects, n_folds=3):
+class DisjointGroupKfold(_BaseKFold):
+    def __init__(self, groups, n_folds=3):
+        """Creates K approximately balanced folds
+            where the same group will not appear in two different folds
+
+        Parameters
+        ----------
+        groups: numpy array of shape (n_samples, )
+            Contains a group id for each sample.
+            The folds are built so that the same id doesn't appear in two different folds.
+
+        n_folds: int, default is 3
+            number of folds
+        """
         self.n_folds = n_folds
-        self.n = len(subjects)
-        self.idxs = subject_independent_folds(subjects=subjects, n_folds=n_folds)
+        self.n = len(groups)
+        self.idxs = disjoint_group_folds(groups=groups, n_folds=n_folds)
 
     def _iter_test_indices(self):
         for i in range(self.n_folds):
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
@@ -12,6 +12,7 @@
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_greater
+from sklearn.utils.testing import assert_greater_equal
 from sklearn.utils.testing import assert_less
 from sklearn.utils.testing import assert_not_equal
 from sklearn.utils.testing import assert_array_almost_equal
@@ -1042,55 +1043,53 @@ def test_check_is_partition():
 
 def test_subject_independent_folds():
     """ Check that the function produces equilibrated folds
-        with no subject appearing in two different folds
+        with no group appearing in two different folds
     """
     # Fix the seed for reproducibility 
-    np.random.seed(0)
+    rng = np.random.RandomState(0)
     
     # Parameters of the test
-    n_subjects = 15
+    n_groups = 15
     n_samples = 1000
     n_folds = 5
     
     # Construct the test data
     tolerance = 0.05 * n_samples # 5 percent error allowed
-    subjects = np.random.randint(0, n_subjects, n_samples)
-    folds = cval.subject_independent_folds(subjects, n_folds)
-    ideal_n_subjects_per_fold = n_samples // n_folds
+    groups = np.random.randint(0, n_groups, n_samples)
+    folds = cval.disjoint_group_folds(groups, n_folds)
+    ideal_n_groups_per_fold = n_samples // n_folds
     
     # Check that folds have approximately the same size
-    assert(len(folds)==len(subjects))
+    assert_equal(len(folds), len(groups))
     for i in np.unique(folds):
-        assert(abs(sum(folds == i) - ideal_n_subjects_per_fold) <= tolerance)
+        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold))
     
     # Check that each subjects appears only in 1 fold
-    for subject in np.unique(subjects):
-        assert(len(np.unique(folds[subjects == subject])) == 1)
+    for group in np.unique(groups):
+        assert_equal(len(np.unique(folds[groups == group])), 1)
         
-    subjects = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
+    # Construct the test data
+    groups = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
                 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
                 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
                 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
     
-    n_subjects = len(np.unique(subjects))
-    n_samples = len(subjects)
+    n_groups = len(np.unique(groups))
+    n_samples = len(groups)
     n_folds = 5
-    
-    # Construct the test data
     tolerance = 0.05 * n_samples # 5 percent error allowed
-    subjects = np.random.randint(0, n_subjects, n_samples)
-    folds = cval.subject_independent_folds(subjects, n_folds)
-    ideal_n_subjects_per_fold = n_samples // n_folds
+    folds = cval.disjoint_group_folds(groups, n_folds)
+    ideal_n_groups_per_fold = n_samples // n_folds
     
     # Check that folds have approximately the same size
-    assert(len(folds)==len(subjects))
+    assert_equal(len(folds), len(groups))
     for i in np.unique(folds):
-        assert(abs(sum(folds == i) - ideal_n_subjects_per_fold) <= tolerance)
+        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold))
     
     # Check that each subjects appears only in 1 fold
-    for subject in np.unique(subjects):
-        assert(len(np.unique(folds[subjects == subject])) == 1)
+    for group in np.unique(groups):
+        assert_equal(len(np.unique(folds[groups == group])), 1)