From 5dd260c9d1e22bf24dfde2e44e18573ec8f5bed7 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Thu, 24 Nov 2016 16:26:22 +0100 Subject: [PATCH 01/10] Set a random random_state at init to ensure deterministic randomness --- sklearn/model_selection/_split.py | 28 +++++++++++++++++---- sklearn/model_selection/tests/test_split.py | 21 +++++++++++----- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index b2ed060e31717..fb28191ff1945 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -284,7 +284,16 @@ def __init__(self, n_splits, shuffle, random_state): self.n_splits = n_splits self.shuffle = shuffle + # For repr self.random_state = random_state + if random_state is None: + # This is done to ensure that the multiple calls to split + # are random for each initialization of splitter but consistent + # across multiple calls for the same initialization. + self._random_state = check_random_state( + random_state).randint(99999999) + else: + self._random_state = random_state def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. 
@@ -407,7 +416,7 @@ def _iter_test_indices(self, X, y=None, groups=None): n_samples = _num_samples(X) indices = np.arange(n_samples) if self.shuffle: - check_random_state(self.random_state).shuffle(indices) + check_random_state(self._random_state).shuffle(indices) n_splits = self.n_splits fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int) @@ -560,9 +569,9 @@ def __init__(self, n_splits=3, shuffle=False, random_state=None): def _make_test_folds(self, X, y=None, groups=None): if self.shuffle: - rng = check_random_state(self.random_state) + rng = check_random_state(self._random_state) else: - rng = self.random_state + rng = self._random_state y = np.asarray(y) n_samples = y.shape[0] unique_y, y_inversed = np.unique(y, return_inverse=True) @@ -922,7 +931,16 @@ def __init__(self, n_splits=10, test_size=0.1, train_size=None, self.n_splits = n_splits self.test_size = test_size self.train_size = train_size + # For repr self.random_state = random_state + if random_state is None: + # This is done to ensure that the multiple calls to split + # are random for each initialization of splitter but consistent + # across multiple calls for the same initialization. + self._random_state = check_random_state( + random_state).randint(99999999) + else: + self._random_state = random_state def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. 
@@ -1042,7 +1060,7 @@ def _iter_indices(self, X, y=None, groups=None): n_samples = _num_samples(X) n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, self.train_size) - rng = check_random_state(self.random_state) + rng = check_random_state(self._random_state) for i in range(self.n_splits): # random partition permutation = rng.permutation(n_samples) @@ -1269,7 +1287,7 @@ def _iter_indices(self, X, y, groups=None): 'equal to the number of classes = %d' % (n_test, n_classes)) - rng = check_random_state(self.random_state) + rng = check_random_state(self._random_state) for _ in range(self.n_splits): # if there are ties in the class-counts, we want diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index fba323492be85..b0ad88ef1b8de 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -430,19 +430,28 @@ def test_shuffle_kfold_stratifiedkfold_reproducibility(): np.testing.assert_equal(list(cv.split(X, y)), list(cv.split(X, y))) np.testing.assert_equal(list(cv.split(X2, y2)), list(cv.split(X2, y2))) + # Tests to ensure consistent behaviour even when random_state is not set. kf = KFold(3, shuffle=True) skf = StratifiedKFold(3, shuffle=True) - - for cv in (kf, skf): + kf1 = KFold(3, shuffle=True) + kf2 = KFold(3, shuffle=True) + skf1 = StratifiedKFold(3, shuffle=True) + skf2 = StratifiedKFold(3, shuffle=True) + for cvs in ((kf, kf1, kf2), (skf, skf1, skf2)): for data in zip((X, X2), (y, y2)): + # For the same initialilzation, splits should be same across + # multiple split calls, even when random_state is not set. 
+ np.testing.assert_equal(list(cvs[0].split(*data)), + list(cvs[0].split(*data))) + try: - np.testing.assert_equal(list(cv.split(*data)), - list(cv.split(*data))) + np.testing.assert_equal(list(cvs[1].split(*data)), + list(cvs[2].split(*data))) except AssertionError: pass else: - raise AssertionError("The splits for data, %s, are same even " - "when random state is not set" % data) + raise AssertionError("When random_state is not set, the splits" + " are same for different initializations") def test_shuffle_stratifiedkfold(): From 75448b9697dcfcb79ac939f0213cb10af5a1bb96 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 28 Nov 2016 12:47:46 +0100 Subject: [PATCH 02/10] COSMIT --- sklearn/model_selection/_split.py | 2 +- sklearn/model_selection/tests/test_split.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index fb28191ff1945..944ae552385b1 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -291,7 +291,7 @@ def __init__(self, n_splits, shuffle, random_state): # are random for each initialization of splitter but consistent # across multiple calls for the same initialization. 
self._random_state = check_random_state( - random_state).randint(99999999) + random_state).randint(np.iinfo(np.int32).max) else: self._random_state = random_state diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index b0ad88ef1b8de..30213551fd73f 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -439,11 +439,13 @@ def test_shuffle_kfold_stratifiedkfold_reproducibility(): skf2 = StratifiedKFold(3, shuffle=True) for cvs in ((kf, kf1, kf2), (skf, skf1, skf2)): for data in zip((X, X2), (y, y2)): - # For the same initialilzation, splits should be same across + # For the same initialization, splits should be same across # multiple split calls, even when random_state is not set. np.testing.assert_equal(list(cvs[0].split(*data)), list(cvs[0].split(*data))) + # For different initialisations, splits should not be same when + # random_state is not set. try: np.testing.assert_equal(list(cvs[1].split(*data)), list(cvs[2].split(*data))) From 6f5947d8f5d631d572372f60ac784aa1d8300460 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 28 Nov 2016 13:22:27 +0100 Subject: [PATCH 03/10] Use np.iinfo(...).max instead of 99999.. --- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 944ae552385b1..855fdb1756785 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -938,7 +938,7 @@ def __init__(self, n_splits=10, test_size=0.1, train_size=None, # are random for each initialization of splitter but consistent # across multiple calls for the same initialization. 
self._random_state = check_random_state( - random_state).randint(99999999) + random_state).randint(np.iinfo(np.int32).max) else: self._random_state = random_state From d05efd43afcc6b59d8bd8a61947fdb39d60f0e73 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Fri, 2 Dec 2016 16:07:55 +0100 Subject: [PATCH 04/10] Handle np.random.RandomState instances; Set expl. state for None in repr --- sklearn/model_selection/_split.py | 27 ++++++++------------- sklearn/model_selection/tests/test_split.py | 13 ++++++---- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 855fdb1756785..9b39a9fcb2571 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -284,16 +284,14 @@ def __init__(self, n_splits, shuffle, random_state): self.n_splits = n_splits self.shuffle = shuffle - # For repr - self.random_state = random_state - if random_state is None: + if not isinstance(random_state, (np.integer, numbers.Integral)): # This is done to ensure that the multiple calls to split # are random for each initialization of splitter but consistent # across multiple calls for the same initialization. - self._random_state = check_random_state( + self.random_state = check_random_state( random_state).randint(np.iinfo(np.int32).max) else: - self._random_state = random_state + self.random_state = random_state def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. 
@@ -416,7 +414,7 @@ def _iter_test_indices(self, X, y=None, groups=None): n_samples = _num_samples(X) indices = np.arange(n_samples) if self.shuffle: - check_random_state(self._random_state).shuffle(indices) + check_random_state(self.random_state).shuffle(indices) n_splits = self.n_splits fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int) @@ -568,10 +566,7 @@ def __init__(self, n_splits=3, shuffle=False, random_state=None): super(StratifiedKFold, self).__init__(n_splits, shuffle, random_state) def _make_test_folds(self, X, y=None, groups=None): - if self.shuffle: - rng = check_random_state(self._random_state) - else: - rng = self._random_state + rng = check_random_state(self.random_state) y = np.asarray(y) n_samples = y.shape[0] unique_y, y_inversed = np.unique(y, return_inverse=True) @@ -931,16 +926,14 @@ def __init__(self, n_splits=10, test_size=0.1, train_size=None, self.n_splits = n_splits self.test_size = test_size self.train_size = train_size - # For repr - self.random_state = random_state - if random_state is None: + if not isinstance(random_state, (np.integer, numbers.Integral)): # This is done to ensure that the multiple calls to split # are random for each initialization of splitter but consistent # across multiple calls for the same initialization. - self._random_state = check_random_state( + self.random_state = check_random_state( random_state).randint(np.iinfo(np.int32).max) else: - self._random_state = random_state + self.random_state = random_state def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. 
@@ -1060,7 +1053,7 @@ def _iter_indices(self, X, y=None, groups=None): n_samples = _num_samples(X) n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, self.train_size) - rng = check_random_state(self._random_state) + rng = check_random_state(self.random_state) for i in range(self.n_splits): # random partition permutation = rng.permutation(n_samples) @@ -1287,7 +1280,7 @@ def _iter_indices(self, X, y, groups=None): 'equal to the number of classes = %d' % (n_test, n_classes)) - rng = check_random_state(self._random_state) + rng = check_random_state(self.random_state) for _ in range(self.n_splits): # if there are ties in the class-counts, we want diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 30213551fd73f..0879aa26f6138 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -147,8 +147,8 @@ def test_cross_validator_with_default_params(): groups = np.array([1, 2, 3, 4]) loo = LeaveOneOut() lpo = LeavePOut(p) - kf = KFold(n_splits) - skf = StratifiedKFold(n_splits) + kf = KFold(n_splits, random_state=0) + skf = StratifiedKFold(n_splits, random_state=0) lolo = LeaveOneGroupOut() lopo = LeavePGroupsOut(p) ss = ShuffleSplit(random_state=0) @@ -156,8 +156,8 @@ def test_cross_validator_with_default_params(): loo_repr = "LeaveOneOut()" lpo_repr = "LeavePOut(p=2)" - kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)" - skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" + kf_repr = "KFold(n_splits=2, random_state=0, shuffle=False)" + skf_repr = "StratifiedKFold(n_splits=2, random_state=0, shuffle=False)" lolo_repr = "LeaveOneGroupOut()" lopo_repr = "LeavePGroupsOut(n_groups=2)" ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, " @@ -425,8 +425,11 @@ def test_shuffle_kfold_stratifiedkfold_reproducibility(): kf = KFold(3, shuffle=True, random_state=0) skf = StratifiedKFold(3, shuffle=True, 
random_state=0) + kf2 = KFold(3, shuffle=True, random_state=np.random.RandomState(0)) + skf2 = StratifiedKFold(3, shuffle=True, + random_state=np.random.RandomState(0)) - for cv in (kf, skf): + for cv in (kf, skf, kf2, skf2): np.testing.assert_equal(list(cv.split(X, y)), list(cv.split(X, y))) np.testing.assert_equal(list(cv.split(X2, y2)), list(cv.split(X2, y2))) From 374d2ed4322d32cd33a36bddaeede6841081f5a7 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Fri, 2 Dec 2016 17:08:20 +0100 Subject: [PATCH 05/10] Fix doctest --- sklearn/model_selection/_split.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 9b39a9fcb2571..a3d3dd66ae28a 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -378,11 +378,11 @@ class KFold(_BaseKFold): >>> from sklearn.model_selection import KFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([1, 2, 3, 4]) - >>> kf = KFold(n_splits=2) + >>> kf = KFold(n_splits=2, random_state=0) >>> kf.get_n_splits(X) 2 >>> print(kf) # doctest: +NORMALIZE_WHITESPACE - KFold(n_splits=2, random_state=None, shuffle=False) + KFold(n_splits=2, random_state=0, shuffle=False) >>> for train_index, test_index in kf.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] @@ -543,11 +543,11 @@ class StratifiedKFold(_BaseKFold): >>> from sklearn.model_selection import StratifiedKFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([0, 0, 1, 1]) - >>> skf = StratifiedKFold(n_splits=2) + >>> skf = StratifiedKFold(n_splits=2, random_state=0) >>> skf.get_n_splits(X, y) 2 >>> print(skf) # doctest: +NORMALIZE_WHITESPACE - StratifiedKFold(n_splits=2, random_state=None, shuffle=False) + StratifiedKFold(n_splits=2, random_state=0, shuffle=False) >>> for train_index, test_index in skf.split(X, y): ... 
print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] From d3db61799079de21de2ebbe915817bb3bd71bf30 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 5 Dec 2016 16:08:41 +0100 Subject: [PATCH 06/10] Change order instead of negating --- sklearn/model_selection/_split.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index a3d3dd66ae28a..4a2d6fcaff3cc 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -284,14 +284,14 @@ def __init__(self, n_splits, shuffle, random_state): self.n_splits = n_splits self.shuffle = shuffle - if not isinstance(random_state, (np.integer, numbers.Integral)): + if isinstance(random_state, (np.integer, numbers.Integral)): + self.random_state = random_state + else: # This is done to ensure that the multiple calls to split # are random for each initialization of splitter but consistent # across multiple calls for the same initialization. self.random_state = check_random_state( random_state).randint(np.iinfo(np.int32).max) - else: - self.random_state = random_state def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. @@ -926,14 +926,14 @@ def __init__(self, n_splits=10, test_size=0.1, train_size=None, self.n_splits = n_splits self.test_size = test_size self.train_size = train_size - if not isinstance(random_state, (np.integer, numbers.Integral)): + if isinstance(random_state, (np.integer, numbers.Integral)): + self.random_state = random_state + else: # This is done to ensure that the multiple calls to split # are random for each initialization of splitter but consistent # across multiple calls for the same initialization. 
self.random_state = check_random_state( random_state).randint(np.iinfo(np.int32).max) - else: - self.random_state = random_state def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. From c762363276caa65c4ba1e0fc1b3cbe3c9c8afdb9 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 5 Dec 2016 16:11:13 +0100 Subject: [PATCH 07/10] Comment clarity --- sklearn/model_selection/tests/test_split.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 0879aa26f6138..d99ae3e24924d 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -429,11 +429,13 @@ def test_shuffle_kfold_stratifiedkfold_reproducibility(): skf2 = StratifiedKFold(3, shuffle=True, random_state=np.random.RandomState(0)) + # 1) Test to ensure consistent behavior when random_state is set explicitly for cv in (kf, skf, kf2, skf2): + # Check that calling split twice yields the same results np.testing.assert_equal(list(cv.split(X, y)), list(cv.split(X, y))) np.testing.assert_equal(list(cv.split(X2, y2)), list(cv.split(X2, y2))) - # Tests to ensure consistent behaviour even when random_state is not set. 
+ # 2) Tests to ensure consistent behavior even when random_state is not set kf = KFold(3, shuffle=True) skf = StratifiedKFold(3, shuffle=True) kf1 = KFold(3, shuffle=True) From 318312e51c9c7277a793c63c405a87e1bd8c4048 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Tue, 6 Dec 2016 16:22:23 +0100 Subject: [PATCH 08/10] Modify random_state only when shuffle is set to True --- sklearn/model_selection/_split.py | 7 ++++--- sklearn/model_selection/tests/test_split.py | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 4a2d6fcaff3cc..f0298ec4d1904 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -284,14 +284,15 @@ def __init__(self, n_splits, shuffle, random_state): self.n_splits = n_splits self.shuffle = shuffle - if isinstance(random_state, (np.integer, numbers.Integral)): - self.random_state = random_state - else: + if shuffle and not isinstance(random_state, + (np.integer, numbers.Integral)): # This is done to ensure that the multiple calls to split # are random for each initialization of splitter but consistent # across multiple calls for the same initialization. self.random_state = check_random_state( random_state).randint(np.iinfo(np.int32).max) + else: + self.random_state = random_state def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. 
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index d99ae3e24924d..e265a8d4ed6b4 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -147,20 +147,20 @@ def test_cross_validator_with_default_params(): groups = np.array([1, 2, 3, 4]) loo = LeaveOneOut() lpo = LeavePOut(p) - kf = KFold(n_splits, random_state=0) - skf = StratifiedKFold(n_splits, random_state=0) + kf = KFold(n_splits) + skf = StratifiedKFold(n_splits) lolo = LeaveOneGroupOut() lopo = LeavePGroupsOut(p) - ss = ShuffleSplit(random_state=0) + ss = ShuffleSplit(random_state=42) ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2 loo_repr = "LeaveOneOut()" lpo_repr = "LeavePOut(p=2)" - kf_repr = "KFold(n_splits=2, random_state=0, shuffle=False)" - skf_repr = "StratifiedKFold(n_splits=2, random_state=0, shuffle=False)" + kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)" + skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" lolo_repr = "LeaveOneGroupOut()" lopo_repr = "LeavePGroupsOut(n_groups=2)" - ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, " + ss_repr = ("ShuffleSplit(n_splits=10, random_state=42, test_size=0.1, " "train_size=None)") ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" From a637c9932831941a6d5e1e9ec4305dce807f475b Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Tue, 6 Dec 2016 16:23:59 +0100 Subject: [PATCH 09/10] Remove random_state param from example docs --- sklearn/model_selection/_split.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index f0298ec4d1904..21a0cac699aa0 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -379,11 +379,11 @@ class KFold(_BaseKFold): >>> from sklearn.model_selection import KFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 
4]]) >>> y = np.array([1, 2, 3, 4]) - >>> kf = KFold(n_splits=2, random_state=0) + >>> kf = KFold(n_splits=2) >>> kf.get_n_splits(X) 2 >>> print(kf) # doctest: +NORMALIZE_WHITESPACE - KFold(n_splits=2, random_state=0, shuffle=False) + KFold(n_splits=2, random_state=None, shuffle=False) >>> for train_index, test_index in kf.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] @@ -544,11 +544,11 @@ class StratifiedKFold(_BaseKFold): >>> from sklearn.model_selection import StratifiedKFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([0, 0, 1, 1]) - >>> skf = StratifiedKFold(n_splits=2, random_state=0) + >>> skf = StratifiedKFold(n_splits=2) >>> skf.get_n_splits(X, y) 2 >>> print(skf) # doctest: +NORMALIZE_WHITESPACE - StratifiedKFold(n_splits=2, random_state=0, shuffle=False) + StratifiedKFold(n_splits=2, random_state=None, shuffle=False) >>> for train_index, test_index in skf.split(X, y): ... print("TRAIN:", train_index, "TEST:", test_index) ... 
X_train, X_test = X[train_index], X[test_index] From bc9f22c20f6ae90a480767659ec3651fef056002 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Tue, 6 Dec 2016 16:35:19 +0100 Subject: [PATCH 10/10] Simplify tests --- sklearn/model_selection/tests/test_split.py | 41 +++++++++++---------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index e265a8d4ed6b4..4fee7ef090de4 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -423,37 +423,38 @@ def test_shuffle_kfold_stratifiedkfold_reproducibility(): X2 = np.ones(16) # Not divisible by 3 y2 = [0] * 8 + [1] * 8 + # random_state set to int with shuffle=True kf = KFold(3, shuffle=True, random_state=0) skf = StratifiedKFold(3, shuffle=True, random_state=0) + # random_state set to RandomState object with shuffle=True kf2 = KFold(3, shuffle=True, random_state=np.random.RandomState(0)) skf2 = StratifiedKFold(3, shuffle=True, random_state=np.random.RandomState(0)) - - # 1) Test to ensure consistent behavior when random_state is set explicitly - for cv in (kf, skf, kf2, skf2): - # Check that calling split twice yields the same results - np.testing.assert_equal(list(cv.split(X, y)), list(cv.split(X, y))) - np.testing.assert_equal(list(cv.split(X2, y2)), list(cv.split(X2, y2))) - - # 2) Tests to ensure consistent behavior even when random_state is not set - kf = KFold(3, shuffle=True) - skf = StratifiedKFold(3, shuffle=True) + # random_state not set with shuffle=True + kf3 = KFold(3, shuffle=True) + skf3 = StratifiedKFold(3, shuffle=True) + + # 1) Test to ensure consistent behavior for multiple split calls + # irrespective of random_state + for cv in (kf, skf, kf2, skf2, kf3, skf3): + for data in ((X, y), (X2, y2)): + # Check that calling split twice yields the same results + np.testing.assert_equal(list(cv.split(*data)), + list(cv.split(*data))) + + # 2) Tests to ensure 
different initializations produce different splits, + # when random_state is not set kf1 = KFold(3, shuffle=True) - kf2 = KFold(3, shuffle=True) skf1 = StratifiedKFold(3, shuffle=True) + kf2 = KFold(3, shuffle=True) skf2 = StratifiedKFold(3, shuffle=True) - for cvs in ((kf, kf1, kf2), (skf, skf1, skf2)): - for data in zip((X, X2), (y, y2)): - # For the same initialization, splits should be same across - # multiple split calls, even when random_state is not set. - np.testing.assert_equal(list(cvs[0].split(*data)), - list(cvs[0].split(*data))) - + for cv1, cv2 in ((kf1, kf2), (skf1, skf2)): + for data in ((X, y), (X2, y2)): # For different initialisations, splits should not be same when # random_state is not set. try: - np.testing.assert_equal(list(cvs[1].split(*data)), - list(cvs[2].split(*data))) + np.testing.assert_equal(list(cv1.split(*data)), + list(cv2.split(*data))) except AssertionError: pass else: