From b1f2c8f65fc6eda5ddfd18e997753fcbdd3319ba Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Wed, 19 Jul 2017 18:05:40 +0200 Subject: [PATCH 1/6] add stratify and shuffle variants for GroupKFold --- doc/modules/cross_validation.rst | 19 +++++++++- sklearn/model_selection/_split.py | 40 +++++++++++++++----- sklearn/model_selection/tests/test_split.py | 42 +++++++++++---------- 3 files changed, 70 insertions(+), 31 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index a43c5cf675cb8..b85f897c89e6a 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -532,11 +532,11 @@ parameter. Group k-fold ------------ -class:GroupKFold is a variation of k-fold which ensures that the same group is +:class:`GroupKFold` is a variation of k-fold which ensures that the same group is not represented in both testing and training sets. For example if the data is obtained from different subjects with several samples per-subject and if the model is flexible enough to learn from highly person specific features it -could fail to generalize to new subjects. class:GroupKFold makes it possible +could fail to generalize to new subjects. :class:`GroupKFold` makes it possible to detect this kind of overfitting situations. Imagine you have three subjects, each with an associated number from 1 to 3:: @@ -558,6 +558,21 @@ Each subject is in a different testing fold, and the same subject is never in both testing and training. Notice that the folds do not have exactly the same size due to the imbalance in the data. +The same group will not appear in two different folds; +this is a hard constraint. After this constraint is enforced, +there are still multiple ways to divide groups across folds. + +The default, ``method='balance'``, will balance the sizes of the folds, +such that each has approximately the same amount of items, as far as possible. +With ``method='stratify'``, items are spread across the folds by stratifying on +the ``y`` variable, as far as possible. Since this is done by sorting, it works +for continuous variables as well. +Finally, ``method='shuffle'`` distributes groups across folds randomly. + +The latter two options work best when groups are relatively small, to avoid +folds of uneven sizes. The stratification relies on the ``y``-value of the +first item of a group being representative of its group. + Leave One Group Out ------------------- diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 386d439184117..41ccf115108b3 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -440,13 +440,16 @@ class GroupKFold(_BaseKFold): The same group will not appear in two different folds (the number of distinct groups has to be at least equal to the number of folds). - The folds are approximately balanced in the sense that the number of - distinct groups is approximately the same in each fold. - Parameters ---------- n_splits : int, default=3 Number of folds. Must be at least 2. + method: string, default='balance' + One of 'balance', 'stratify', 'shuffle'. + By default, try to equalize the sizes of the resulting folds. + If 'stratify', sort groups according to ``y`` variable and distribute + evenly across folds. + If 'shuffle', shuffle the groups to randomize their assignments to folds. Examples -------- @@ -480,7 +483,11 @@ class GroupKFold(_BaseKFold): For splitting the data according to explicit domain-specific stratification of the dataset. 
""" - def __init__(self, n_splits=3): + def __init__(self, n_splits=3, method='balance'): + if method not in ('balance', 'stratify', 'shuffle'): + raise ValueError("The 'method' parameter should be in " + "('balance', 'stratify', 'shuffle')") + self.method = method super(GroupKFold, self).__init__(n_splits, shuffle=False, random_state=None) @@ -489,7 +496,8 @@ def _iter_test_indices(self, X, y, groups): raise ValueError("The 'groups' parameter should not be None.") groups = check_array(groups, ensure_2d=False, dtype=None) - unique_groups, groups = np.unique(groups, return_inverse=True) + unique_groups, unique_indices, groups = np.unique( + groups, return_index=True, return_inverse=True) n_groups = len(unique_groups) if self.n_splits > n_groups: @@ -500,17 +508,31 @@ def _iter_test_indices(self, X, y, groups): # Weight groups by their number of occurrences n_samples_per_group = np.bincount(groups) - # Distribute the most frequent groups first - indices = np.argsort(n_samples_per_group)[::-1] + if self.method == 'balance': + # Distribute the most frequent groups first + indices = np.argsort(n_samples_per_group)[::-1] + elif self.method == 'stratify': + # Distribute according to y values + if y is None: + raise ValueError("The 'y' parameter should not be None.") + y = check_array(y, ensure_2d=False, dtype=None) + indices = np.argsort(y[unique_indices]) + elif self.method == 'shuffle': + # Shuffle the groups + rng = check_random_state(self.random_state) + indices = np.arange(n_groups) + rng.shuffle(indices) + else: + raise ValueError n_samples_per_group = n_samples_per_group[indices] # Total weight of each fold n_samples_per_fold = np.zeros(self.n_splits) # Mapping from group index to fold index - group_to_fold = np.zeros(len(unique_groups)) + group_to_fold = np.zeros(n_groups) - # Distribute samples by adding the largest weight to the lightest fold + # Distribute samples by adding groups to the lightest fold for group_index, weight in enumerate(n_samples_per_group): lightest_fold = np.argmin(n_samples_per_fold) n_samples_per_fold[lightest_fold] += weight diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 300bb8953efae..15ebae0ce6e0e 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1220,28 +1220,30 @@ def test_group_kfold(): ideal_n_groups_per_fold = n_samples // n_splits - len(np.unique(groups)) - # Get the test fold indices from the test set indices of each fold - folds = np.zeros(n_samples) - lkf = GroupKFold(n_splits=n_splits) - for i, (_, test) in enumerate(lkf.split(X, y, groups)): - folds[test] = i - - # Check that folds have approximately the same size - assert_equal(len(folds), len(groups)) - for i in np.unique(folds): - assert_greater_equal(tolerance, - abs(sum(folds == i) - ideal_n_groups_per_fold)) - - # Check that each group appears only in 1 fold - for group in np.unique(groups): - assert_equal(len(np.unique(folds[groups == group])), 1) + for method in ('balance', 'stratify', 'shuffle'): + # Get the test fold indices from the test set indices of each fold + folds = np.zeros(n_samples) + lkf = GroupKFold(n_splits=n_splits, method=method) + for i, (_, test) in enumerate(lkf.split(X, y, groups)): + folds[test] = i + + # Check that folds have approximately the same size + if method == 'balance': + assert_equal(len(folds), len(groups)) + for i in np.unique(folds): + assert_greater_equal(tolerance, + abs(sum(folds == i) - ideal_n_groups_per_fold)) + + # Check that 
each group appears only in 1 fold + for group in np.unique(groups): + assert_equal(len(np.unique(folds[groups == group])), 1) - # Check that no group is on both sides of the split - groups = np.asarray(groups, dtype=object) - for train, test in lkf.split(X, y, groups): - assert_equal(len(np.intersect1d(groups[train], groups[test])), 0) + # Check that no group is on both sides of the split + groups = np.asarray(groups, dtype=object) + for train, test in lkf.split(X, y, groups): + assert_equal(len(np.intersect1d(groups[train], groups[test])), 0) + lkf = GroupKFold(n_splits=n_splits, method='balance') # Construct the test data groups = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', From ee6567c3f409530db97c495c41c72a3734eb272c Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Thu, 20 Jul 2017 14:35:00 +0200 Subject: [PATCH 2/6] add example --- doc/modules/cross_validation.rst | 4 +++ examples/model_selection/plot_groupkfold.py | 30 +++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 examples/model_selection/plot_groupkfold.py diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index b85f897c89e6a..0e4b62ea0f919 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -573,6 +573,10 @@ The latter two options work best when groups are relatively small, to avoid folds of uneven sizes. The stratification relies on the ``y``-value of the first item of a group being representative of its group. +.. topic:: Examples + + * :ref:`sphx_glr_auto_examples_model_selection_plot_groupkfold.py`, + Leave One Group Out ------------------- diff --git a/examples/model_selection/plot_groupkfold.py b/examples/model_selection/plot_groupkfold.py new file mode 100644 index 0000000000000..93553341497e1 --- /dev/null +++ b/examples/model_selection/plot_groupkfold.py @@ -0,0 +1,30 @@ +""" +==================== +Group K-Fold methods +==================== + +This example demonstrates when the stratify option of GroupKFold has an +advantage. +""" +from matplotlib import pyplot as plt +import numpy as np +from sklearn.model_selection import GroupKFold + +print(__doc__) + +rng = np.random.RandomState(0) +n_samples = 1000 +n_groups = 100 +X = np.arange(n_samples) +y = np.sort(rng.normal(size=n_samples)) +groups = np.sort(rng.randint(0, n_groups, n_samples)) + +fig, axes = plt.subplots(1, 3, figsize=(18, 4), sharex=True, sharey=True) +for n, method in enumerate(('balance', 'stratify', 'shuffle')): + cv = GroupKFold(2, method=method) + for m, (train, test) in enumerate(cv.split(X, y, groups)): + axes[n].hist(y[test], bins=20, histtype='step') + print('%s fold %d: %d items' % (method, m + 1, len(test))) + axes[n].set_xlabel(method) + +plt.show() From b625fa1b441b260eff8a4daeca95d71b1f44d8dd Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Fri, 21 Jul 2017 13:41:10 +0200 Subject: [PATCH 3/6] use median value for stratify --- doc/modules/cross_validation.rst | 6 +++--- sklearn/model_selection/_split.py | 18 +++++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 0e4b62ea0f919..b689dadc16137 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -569,9 +569,9 @@ the ``y`` variable, as far as possible. Since this is done by sorting, it works for continuous variables as well. 
Finally, ``method='shuffle'`` distributes groups across folds randomly. -The latter two options work best when groups are relatively small, to avoid -folds of uneven sizes. The stratification relies on the ``y``-value of the -first item of a group being representative of its group. +The latter two options work best when groups are relatively small (i.e., there +are many groups), to avoid folds of uneven sizes. The stratification relies on +the median ``y``-value of each group being representative of its group. .. topic:: Examples diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 41ccf115108b3..d492bff177f49 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -447,9 +447,10 @@ class GroupKFold(_BaseKFold): method: string, default='balance' One of 'balance', 'stratify', 'shuffle'. By default, try to equalize the sizes of the resulting folds. - If 'stratify', sort groups according to ``y`` variable and distribute - evenly across folds. - If 'shuffle', shuffle the groups to randomize their assignments to folds. + If 'stratify', sort groups according to their median ``y`` values + and distribute evenly across folds. + If 'shuffle', shuffle the groups to randomize their assignments to + folds. Examples -------- @@ -512,11 +513,18 @@ def _iter_test_indices(self, X, y, groups): # Distribute the most frequent groups first indices = np.argsort(n_samples_per_group)[::-1] elif self.method == 'stratify': - # Distribute according to y values + # Distribute according to median y value per group if y is None: raise ValueError("The 'y' parameter should not be None.") y = check_array(y, ensure_2d=False, dtype=None) - indices = np.argsort(y[unique_indices]) + y_by_group = dict.fromkeys(unique_groups, []) + for group, y_value in zip(groups, y): + y_by_group[group].append(y_value) + # manual median; np.median doesn't work when groups are strings. + median_by_group = [ + sorted(y_by_group[group])[len(y_by_group[group]) // 2] + for group in unique_groups] + indices = np.argsort(median_by_group) elif self.method == 'shuffle': # Shuffle the groups rng = check_random_state(self.random_state) From 2f4ce3afbe56f80dea0a158cee99a0bc2aefa494 Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Sat, 22 Jul 2017 14:53:45 +0200 Subject: [PATCH 4/6] fix doctest failure --- sklearn/model_selection/_split.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index d492bff177f49..d60538c3cdea7 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -458,11 +458,11 @@ class GroupKFold(_BaseKFold): >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) >>> y = np.array([1, 2, 3, 4]) >>> groups = np.array([0, 0, 2, 2]) - >>> group_kfold = GroupKFold(n_splits=2) + >>> group_kfold = GroupKFold(method='balance', n_splits=2) >>> group_kfold.get_n_splits(X, y, groups) 2 >>> print(group_kfold) - GroupKFold(n_splits=2) + GroupKFold(method='balance', n_splits=2) >>> for train_index, test_index in group_kfold.split(X, y, groups): ... print("TRAIN:", train_index, "TEST:", test_index) ... 
X_train, X_test = X[train_index], X[test_index] From dc8af386e301f3f1af5aa4b285ba03ef3189d89d Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Sat, 22 Jul 2017 15:38:42 +0200 Subject: [PATCH 5/6] fix flake8 issues --- sklearn/model_selection/_split.py | 2 +- sklearn/model_selection/tests/test_split.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index d60538c3cdea7..9c902237e95ac 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -487,7 +487,7 @@ class GroupKFold(_BaseKFold): def __init__(self, n_splits=3, method='balance'): if method not in ('balance', 'stratify', 'shuffle'): raise ValueError("The 'method' parameter should be in " - "('balance', 'stratify', 'shuffle')") + "('balance', 'stratify', 'shuffle')") self.method = method super(GroupKFold, self).__init__(n_splits, shuffle=False, random_state=None) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 15ebae0ce6e0e..bca23370fadfa 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1231,7 +1231,8 @@ def test_group_kfold(): if method == 'balance': assert_equal(len(folds), len(groups)) for i in np.unique(folds): - assert_greater_equal(tolerance, + assert_greater_equal( + tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold From 90173807ea63c24f006e53f3b3b6edc25e5b19ba Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Thu, 27 Jul 2017 17:52:10 +0200 Subject: [PATCH 6/6] add stratify_mode; improve documentation --- doc/modules/cross_validation.rst | 34 +++++++++++-------- examples/model_selection/plot_groupkfold.py | 11 +++--- sklearn/model_selection/_split.py | 37 +++++++++++++-------- sklearn/model_selection/tests/test_split.py | 2 +- 4 files changed, 52 insertions(+), 32 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index b689dadc16137..2cf1258fc80a1 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -532,12 +532,12 @@ parameter. Group k-fold ------------ -:class:`GroupKFold` is a variation of k-fold which ensures that the same group is -not represented in both testing and training sets. For example if the data is -obtained from different subjects with several samples per-subject and if the +:class:`GroupKFold` is a variation of k-fold which ensures that the same group +is not represented in both testing and training sets. For example if the data +is obtained from different subjects with several samples per-subject and if the model is flexible enough to learn from highly person specific features it could fail to generalize to new subjects. :class:`GroupKFold` makes it possible -to detect this kind of overfitting situations. +to avoid these kind of overfitting situations. Imagine you have three subjects, each with an associated number from 1 to 3:: @@ -560,18 +560,26 @@ size due to the imbalance in the data. The same group will not appear in two different folds; this is a hard constraint. After this constraint is enforced, -there are still multiple ways to divide groups across folds. - -The default, ``method='balance'``, will balance the sizes of the folds, -such that each has approximately the same amount of items, as far as possible. 
-With ``method='stratify'``, items are spread across the folds by stratifying on -the ``y`` variable, as far as possible. Since this is done by sorting, it works -for continuous variables as well. -Finally, ``method='shuffle'`` distributes groups across folds randomly. +there are still multiple ways to divide groups across folds. A greedy strategy +is used to create folds of approximately the same size: at each step, the fold +with the least number of items is assigned a new group. The order in which +groups are assigned can be used to tweak the distribution of the resulting +folds. + +The default, ``method='balance'``, will try to balance the sizes of the folds, +by assigning the largest groups first. With ``method='stratify_median'`` or +``method='stratify_mode``, items are spread across the folds by stratifying on +the ``y`` variable, as far as possible. Median should be used for continuous +variables, and mode for discrete variables. Stratification may be important +when the ``y`` variable has a skewed distribution; stratification can help +ensure that rare ``y`` values are represented in each fold. +Finally, ``method='shuffle'`` adds randomness by shuffling the groups. This +strategy is useful when you want to generate multiple sets of folds; repeated +use of the other methods would deterministically result in the same folds. The latter two options work best when groups are relatively small (i.e., there are many groups), to avoid folds of uneven sizes. The stratification relies on -the median ``y``-value of each group being representative of its group. +picking ``y``-values of each group that are representative of its group. .. topic:: Examples diff --git a/examples/model_selection/plot_groupkfold.py b/examples/model_selection/plot_groupkfold.py index 93553341497e1..9621d906a2a86 100644 --- a/examples/model_selection/plot_groupkfold.py +++ b/examples/model_selection/plot_groupkfold.py @@ -15,16 +15,19 @@ rng = np.random.RandomState(0) n_samples = 1000 n_groups = 100 +n_folds = 2 X = np.arange(n_samples) +# Sort data points to highlight the effect of stratification y = np.sort(rng.normal(size=n_samples)) groups = np.sort(rng.randint(0, n_groups, n_samples)) fig, axes = plt.subplots(1, 3, figsize=(18, 4), sharex=True, sharey=True) -for n, method in enumerate(('balance', 'stratify', 'shuffle')): - cv = GroupKFold(2, method=method) +for n, method in enumerate(('balance', 'stratify_median', 'shuffle')): + cv = GroupKFold(n_folds, method=method) for m, (train, test) in enumerate(cv.split(X, y, groups)): - axes[n].hist(y[test], bins=20, histtype='step') + axes[n].hist(y[test], bins=20, histtype='step', + label='fold %d' % (m + 1)) print('%s fold %d: %d items' % (method, m + 1, len(test))) axes[n].set_xlabel(method) - + axes[n].legend(loc='upper right') plt.show() diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 9c902237e95ac..e9596000ad061 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -445,10 +445,12 @@ class GroupKFold(_BaseKFold): n_splits : int, default=3 Number of folds. Must be at least 2. method: string, default='balance' - One of 'balance', 'stratify', 'shuffle'. + One of 'balance', 'stratify_median', 'stratify_mode', 'shuffle'. By default, try to equalize the sizes of the resulting folds. - If 'stratify', sort groups according to their median ``y`` values - and distribute evenly across folds. 
+ If 'stratify_median', distribute groups evenly across folds according + to their median ``y`` values; use when ``y`` is continuous. + If 'stratify_mode', distribute groups evenly across folds according to + the mode of their ``y`` values; use when ``y`` is discrete. If 'shuffle', shuffle the groups to randomize their assignments to folds. @@ -485,9 +487,11 @@ class GroupKFold(_BaseKFold): stratification of the dataset. """ def __init__(self, n_splits=3, method='balance'): - if method not in ('balance', 'stratify', 'shuffle'): - raise ValueError("The 'method' parameter should be in " - "('balance', 'stratify', 'shuffle')") + if method not in ('balance', 'stratify_median', 'stratify_mode', + 'shuffle'): + raise ValueError("The 'method' parameter should be one of: " + "'balance', 'stratify_median', 'stratify_mode', " + "'shuffle'") self.method = method super(GroupKFold, self).__init__(n_splits, shuffle=False, random_state=None) @@ -512,7 +516,7 @@ def _iter_test_indices(self, X, y, groups): if self.method == 'balance': # Distribute the most frequent groups first indices = np.argsort(n_samples_per_group)[::-1] - elif self.method == 'stratify': + elif self.method.startswith('stratify_'): # Distribute according to median y value per group if y is None: raise ValueError("The 'y' parameter should not be None.") @@ -520,18 +524,23 @@ def _iter_test_indices(self, X, y, groups): y_by_group = dict.fromkeys(unique_groups, []) for group, y_value in zip(groups, y): y_by_group[group].append(y_value) - # manual median; np.median doesn't work when groups are strings. - median_by_group = [ - sorted(y_by_group[group])[len(y_by_group[group]) // 2] - for group in unique_groups] - indices = np.argsort(median_by_group) + if self.method == 'stratify_median': + # manual median; np.median doesn't work when groups are strings + by_group = [ + sorted(y_by_group[group])[len(y_by_group[group]) // 2] + for group in unique_groups] + elif self.method == 'stratify_mode': + def mode(x): + values, counts = np.unique(x, return_counts=True) + return values[np.argmax(counts)] + + by_group = [mode(y_by_group[group]) for group in unique_groups] + indices = np.argsort(by_group) elif self.method == 'shuffle': # Shuffle the groups rng = check_random_state(self.random_state) indices = np.arange(n_groups) rng.shuffle(indices) - else: - raise ValueError n_samples_per_group = n_samples_per_group[indices] # Total weight of each fold diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index bca23370fadfa..d46b8625a5677 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1220,7 +1220,7 @@ def test_group_kfold(): ideal_n_groups_per_fold = n_samples // n_splits - for method in ('balance', 'stratify', 'shuffle'): + for method in ('balance', 'stratify_median', 'stratify_mode', 'shuffle'): # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) lkf = GroupKFold(n_splits=n_splits, method=method)
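
The documentation added in patch 6/6 describes the assignment as a greedy strategy: groups are visited in an order determined by ``method``, and each group is placed into the fold that currently holds the fewest samples. The following is a minimal standalone sketch of that strategy, not the code from this patch series: the helper name ``assign_groups_to_folds`` is hypothetical, ``np.median`` is used for brevity (so ``y`` is assumed numeric, unlike the manual median in the patch), and the ``method`` argument of ``GroupKFold`` itself is only available once these patches are applied.

import numpy as np


def assign_groups_to_folds(y, groups, n_splits=2, order='balance', seed=0):
    """Greedy fold assignment: each group goes to the currently lightest fold.

    ``order`` mimics the patch's 'balance', 'stratify_median' and 'shuffle'
    methods by changing the order in which groups are visited.
    """
    y, groups = np.asarray(y), np.asarray(groups)
    unique_groups, group_idx = np.unique(groups, return_inverse=True)
    sizes = np.bincount(group_idx)            # number of samples per group

    if order == 'balance':
        # Visit the largest groups first, so they can still be spread evenly.
        visit = np.argsort(sizes)[::-1]
    elif order == 'stratify_median':
        # Sort groups by their median target value; consecutive groups then
        # tend to land in different folds, which stratifies the folds on y.
        medians = [np.median(y[group_idx == g])
                   for g in range(len(unique_groups))]
        visit = np.argsort(medians)
    else:  # 'shuffle'
        visit = np.random.RandomState(seed).permutation(len(unique_groups))

    fold_sizes = np.zeros(n_splits)
    group_to_fold = np.empty(len(unique_groups), dtype=int)
    for g in visit:
        lightest = np.argmin(fold_sizes)      # greedy step: fill the lightest fold
        fold_sizes[lightest] += sizes[g]
        group_to_fold[g] = lightest
    return group_to_fold[group_idx]           # fold index for every sample


rng = np.random.RandomState(0)
y = np.sort(rng.normal(size=20))              # sorted, so groups differ in y
groups = np.sort(rng.randint(0, 6, 20))
for order in ('balance', 'stratify_median', 'shuffle'):
    folds = assign_groups_to_folds(y, groups, n_splits=2, order=order)
    print(order,
          'sizes:', np.bincount(folds),
          'means:', np.round([y[folds == k].mean() for k in range(2)], 2))

With 'balance' the two folds end up nearly equal in size but can differ markedly in their mean ``y``, whereas 'stratify_median' makes the ``y`` distributions of the folds more similar at the possible cost of less even fold sizes, which is the trade-off discussed in the documentation changes above.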