From 3e70301a4d4eaca26f962de935477d0e118bf027 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Fri, 4 Nov 2016 11:19:04 +0100 Subject: [PATCH 01/16] DOC Add NOTE that unless random_state is set, split will not be identical --- sklearn/model_selection/_split.py | 42 +++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 642e8107e185d..04cc49e06d752 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -84,6 +84,13 @@ def split(self, X, y=None, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + + Multiple calls to the ``split`` method will not return identical + training or testing sets if ``random_state`` parameter exists and is + not explicitly set to an integer value. """ X, y, groups = indexable(X, y, groups) indices = np.arange(_num_samples(X)) @@ -309,6 +316,13 @@ def split(self, X, y=None, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + + Multiple calls to the ``split`` method will not return identical + training or testing sets if ``random_state`` parameter exists and is + not explicitly set to an integer value. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) @@ -632,6 +646,13 @@ def split(self, X, y, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + + Multiple calls to the ``split`` method will not return identical + training or testing sets unless ``random_state`` is set to an integer + value. """ return super(StratifiedKFold, self).split(X, y, groups) @@ -709,6 +730,13 @@ def split(self, X, y=None, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + + Multiple calls to the ``split`` method will not return identical + training or testing sets unless ``random_state`` is set to an integer + value. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) @@ -938,6 +966,13 @@ def split(self, X, y=None, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + + Multiple calls to the ``split`` method will not return identical + training or testing sets unless ``random_state`` is set to an integer + value. """ X, y, groups = indexable(X, y, groups) for train, test in self._iter_indices(X, y, groups): @@ -1304,6 +1339,13 @@ def split(self, X, y, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + + Multiple calls to the ``split`` method will not return identical + training or testing sets unless ``random_state`` is set to an integer + value. """ return super(StratifiedShuffleSplit, self).split(X, y, groups) From 791766f92d27b02d5f5f4db6184cb7eceede37ba Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Fri, 4 Nov 2016 11:39:52 +0100 Subject: [PATCH 02/16] TST use np.testing.assert_equal for nested lists/arrays --- sklearn/model_selection/tests/test_split.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 936abf03ac055..ff620b5088ae8 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -880,10 +880,15 @@ def test_cv_iterable_wrapper(): # results kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y) kf_randomized_iter_wrapped = check_cv(kf_randomized_iter) - assert_array_equal(list(kf_randomized_iter_wrapped.split(X, y)), - list(kf_randomized_iter_wrapped.split(X, y))) - assert_true(np.any(np.array(list(kf_iter_wrapped.split(X, y))) != - np.array(list(kf_randomized_iter_wrapped.split(X, y))))) + np.testing.assert_array_equal( + list(kf_randomized_iter_wrapped.split(X, y)), + list(kf_randomized_iter_wrapped.split(X, y))) + try: + np.testing.assert_equal( + np.array(list(kf_iter_wrapped.split(X, y))), + np.array(list(kf_randomized_iter_wrapped.split(X, y)))) + except AssertionError: + pass def test_group_kfold(): From 5b226fef8c3f1770cbd8905ca0b231b765956af2 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Fri, 4 Nov 2016 11:40:12 +0100 Subject: [PATCH 03/16] TST Make sure cv param can be a generator --- sklearn/model_selection/tests/test_search.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 1ce28755075a4..f733be29d2282 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1175,12 +1175,30 @@ def test_grid_search_cv_splits_consistency(): cv=KFold(n_splits=n_splits)) gs2.fit(X, y) + # Give generator as a cv parameter + gs3 = GridSearchCV(LinearSVC(random_state=0), + param_grid={'C': [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, + random_state=0).split(X, y)) + gs3.fit(X, y) + + gs4 = GridSearchCV(LinearSVC(random_state=0), + param_grid={'C': [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, + random_state=0)) + gs4.fit(X, y) + + def _pop_time_keys(cv_results): for key in ('mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time'): cv_results.pop(key) return cv_results + # Check if generators as supported as cv and that the splits are consistent + np.testing.assert_equal(_pop_time_keys(gs3.cv_results_), + _pop_time_keys(gs4.cv_results_)) + # OneTimeSplitter is a non-re-entrant cv where split can be called only # once if ``cv.split`` is called once per param setting in GridSearchCV.fit # the 2nd and 3rd parameter will not be evaluated as no train/test indices From 4f188250b3d2bcd4d9855032bf85f9a0df1adf6c Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Fri, 4 Nov 2016 18:02:49 +0100 Subject: [PATCH 04/16] DOC rank_ becomes a link when rendered --- sklearn/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d2f5542ebd32f..2be5c9abfdcd6 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -816,7 +816,7 @@ class GridSearchCV(BaseSearchCV): For instance the below given table +------------+-----------+------------+-----------------+---+---------+ - |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_....| + |param_kernel|param_gamma|param_degree|split0_test_score|...| rank... | +============+===========+============+=================+===+=========+ | 'poly' | -- | 2 | 0.8 |...| 2 | +------------+-----------+------------+-----------------+---+---------+ From 9f483828f8c1ebd1baed88ce4f5b1b3589ab6dea Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Sun, 6 Nov 2016 21:34:41 +0100 Subject: [PATCH 05/16] Use test_... --- sklearn/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 2be5c9abfdcd6..8a78d14ceb774 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -816,7 +816,7 @@ class GridSearchCV(BaseSearchCV): For instance the below given table +------------+-----------+------------+-----------------+---+---------+ - |param_kernel|param_gamma|param_degree|split0_test_score|...| rank... | + |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...| +============+===========+============+=================+===+=========+ | 'poly' | -- | 2 | 0.8 |...| 2 | +------------+-----------+------------+-----------------+---+---------+ From 29eef94b8e1a4e245544d4678364926dc9280a1e Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Sun, 6 Nov 2016 21:35:07 +0100 Subject: [PATCH 06/16] Remove blank line; Add if shuffle is True --- sklearn/model_selection/_split.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 04cc49e06d752..489056b2e089d 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -87,7 +87,6 @@ def split(self, X, y=None, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical training or testing sets if ``random_state`` parameter exists and is not explicitly set to an integer value. @@ -319,7 +318,6 @@ def split(self, X, y=None, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical training or testing sets if ``random_state`` parameter exists and is not explicitly set to an integer value. @@ -649,10 +647,9 @@ def split(self, X, y, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical training or testing sets unless ``random_state`` is set to an integer - value. + value, if ``shuffle=True``. """ return super(StratifiedKFold, self).split(X, y, groups) @@ -733,10 +730,9 @@ def split(self, X, y=None, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical training or testing sets unless ``random_state`` is set to an integer - value. + value, if ``shuffle=True``. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) @@ -969,7 +965,6 @@ def split(self, X, y=None, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical training or testing sets unless ``random_state`` is set to an integer value. @@ -1342,7 +1337,6 @@ def split(self, X, y, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical training or testing sets unless ``random_state`` is set to an integer value. From cb5ff5d62e28458423e6f76f579bad9f7560049d Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 7 Nov 2016 13:47:10 +0100 Subject: [PATCH 07/16] Fix tests --- sklearn/model_selection/tests/test_split.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index ff620b5088ae8..b9391a0261ad6 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -889,6 +889,10 @@ def test_cv_iterable_wrapper(): np.array(list(kf_randomized_iter_wrapped.split(X, y)))) except AssertionError: pass + else: + raise AssertionError("The splits for data are same for randomized " + "and non-randomized versions of kfold iter " + "wrapped by _CVIterableWrapper") def test_group_kfold(): From 1c6d169e214be5ce8c99232cac94f69b9fe891e1 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 7 Nov 2016 13:51:54 +0100 Subject: [PATCH 08/16] Explicitly test for GeneratorType --- sklearn/model_selection/tests/test_search.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index f733be29d2282..e2fb9e6e8ab0e 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -7,6 +7,7 @@ from itertools import chain, product import pickle import sys +from types import GeneratorType import numpy as np import scipy.sparse as sp @@ -1175,6 +1176,10 @@ def test_grid_search_cv_splits_consistency(): cv=KFold(n_splits=n_splits)) gs2.fit(X, y) + assert_true(isinstance(KFold(n_splits=n_splits, + shuffle=True, random_state=0).split(X, y), + GeneratorType)) + # Give generator as a cv parameter gs3 = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [0.1, 0.2, 0.3]}, From d355dcab8421635e2581ad2ab7c6125cca61c685 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Thu, 10 Nov 2016 13:24:25 +0100 Subject: [PATCH 09/16] TST Add the else clause --- sklearn/model_selection/tests/test_search.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index e2fb9e6e8ab0e..de12731aff214 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -858,11 +858,15 @@ def test_search_cv_results_rank_tie_breaking(): cv_results['mean_test_score'][2]) except AssertionError: pass + else: + raise AssertionError("The values are not different.") try: assert_almost_equal(cv_results['mean_train_score'][1], cv_results['mean_train_score'][2]) except AssertionError: pass + else: + raise AssertionError("The values are not different.") # 'min' rank should be assigned to the tied candidates assert_almost_equal(search.cv_results_['rank_test_score'], [1, 1, 3]) From dff2f5a4cc3a985c87b98ba9a78aa9a0b70641dc Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Thu, 10 Nov 2016 13:28:03 +0100 Subject: [PATCH 10/16] TST Add comment on usage of np.testing.assert_array_equal --- sklearn/model_selection/tests/test_split.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index b9391a0261ad6..f2c3b3f27ab3f 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -356,9 +356,11 @@ def test_shuffle_kfold_stratifiedkfold_reproducibility(): for cv in (kf, skf): for data in zip((X, X2), (y, y2)): + # Test if the two splits are different + # numpy's assert_equal properly compares nested lists try: - np.testing.assert_equal(list(cv.split(*data)), - list(cv.split(*data))) + np.testing.assert_array_equal(list(cv.split(*data)), + list(cv.split(*data))) except AssertionError: pass else: @@ -880,6 +882,7 @@ def test_cv_iterable_wrapper(): # results kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y) kf_randomized_iter_wrapped = check_cv(kf_randomized_iter) + # numpy's assert_array_equal properly compares nested lists np.testing.assert_array_equal( list(kf_randomized_iter_wrapped.split(X, y)), list(kf_randomized_iter_wrapped.split(X, y))) From af8107d16b0cb223acd6477ac7d3b57146f7dff3 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Thu, 29 Jun 2017 18:41:29 +0200 Subject: [PATCH 11/16] TYPO --- sklearn/model_selection/tests/test_search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index ab2837a319cef..52e4af3183288 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1301,7 +1301,8 @@ def _pop_time_keys(cv_results): cv_results.pop(key) return cv_results - # Check if generators as supported as cv and that the splits are consistent + # Check if generators are supported as cv and + # that the splits are consistent np.testing.assert_equal(_pop_time_keys(gs3.cv_results_), _pop_time_keys(gs4.cv_results_)) From a2fcd332c801516abc87aa4104d7bbdc641b832b Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Thu, 29 Jun 2017 19:13:54 +0200 Subject: [PATCH 12/16] MNT Remove if ; --- sklearn/model_selection/_split.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 1c48ea1fd5ffc..9bbb331e5cd96 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -579,10 +579,7 @@ def __init__(self, n_splits=3, shuffle=False, random_state=None): super(StratifiedKFold, self).__init__(n_splits, shuffle, random_state) def _make_test_folds(self, X, y=None, groups=None): - if self.shuffle: - rng = check_random_state(self.random_state) - else: - rng = self.random_state + rng = self.random_state y = np.asarray(y) n_samples = y.shape[0] unique_y, y_inversed = np.unique(y, return_inverse=True) From f02d50e82d77e656b36fa25c10ff57cb251a062a Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Wed, 12 Jul 2017 11:06:20 -0500 Subject: [PATCH 13/16] Address Joel's comments --- doc/modules/cross_validation.rst | 4 +++ sklearn/model_selection/_split.py | 36 ++++++++++---------- sklearn/model_selection/tests/test_search.py | 18 +++------- 3 files changed, 26 insertions(+), 32 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index ab7d2227447b1..69ca6e1edc8c1 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -728,6 +728,10 @@ to shuffle the data indices before splitting them. Note that: * To ensure results are repeatable (*on the same platform*), use a fixed value for ``random_state``. +The randomized CV splitters may return different results for each call of +split. This can be avoided (and identical results returned for each split) by +setting ``random_state`` to an integer. + Cross validation and model selection ==================================== diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 973c0b79b061e..0122f01059369 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -86,9 +86,9 @@ def split(self, X, y=None, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical - training or testing sets if ``random_state`` parameter exists and is - not explicitly set to an integer value. + Randomized CV splitters may return different results for each call of + split. This can be avoided (and identical results returned for each + split) by setting ``random_state`` to an integer. """ X, y, groups = indexable(X, y, groups) indices = np.arange(_num_samples(X)) @@ -317,9 +317,9 @@ def split(self, X, y=None, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical - training or testing sets if ``random_state`` parameter exists and is - not explicitly set to an integer value. + Randomized CV splitters may return different results for each call of + split. This can be avoided (and identical results returned for each + split) by setting ``random_state`` to an integer. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) @@ -657,9 +657,9 @@ def split(self, X, y, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical - training or testing sets unless ``random_state`` is set to an integer - value, if ``shuffle=True``. + Randomized CV splitters may return different results for each call of + split. This can be avoided (and identical results returned for each + split) by setting random_state to an integer. """ y = check_array(y, ensure_2d=False, dtype=None) return super(StratifiedKFold, self).split(X, y, groups) @@ -744,9 +744,9 @@ def split(self, X, y=None, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical - training or testing sets unless ``random_state`` is set to an integer - value, if ``shuffle=True``. + Randomized CV splitters may return different results for each call of + split. This can be avoided (and identical results returned for each + split) by setting random_state to an integer. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) @@ -1188,9 +1188,9 @@ def split(self, X, y=None, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical - training or testing sets unless ``random_state`` is set to an integer - value. + Randomized CV splitters may return different results for each call of + split. This can be avoided (and identical results returned for each + split) by setting random_state to an integer. """ X, y, groups = indexable(X, y, groups) for train, test in self._iter_indices(X, y, groups): @@ -1608,9 +1608,9 @@ def split(self, X, y, groups=None): Note ---- - Multiple calls to the ``split`` method will not return identical - training or testing sets unless ``random_state`` is set to an integer - value. + Randomized CV splitters may return different results for each call of + split. This can be avoided (and identical results returned for each + split) by setting random_state to an integer. """ y = check_array(y, ensure_2d=False, dtype=None) return super(StratifiedShuffleSplit, self).split(X, y, groups) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 8065a4e8f14df..058269c0d2c0b 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1071,20 +1071,10 @@ def test_search_cv_results_rank_tie_breaking(): cv_results['mean_test_score'][1]) assert_almost_equal(cv_results['mean_train_score'][0], cv_results['mean_train_score'][1]) - try: - assert_almost_equal(cv_results['mean_test_score'][1], - cv_results['mean_test_score'][2]) - except AssertionError: - pass - else: - raise AssertionError("The values are not different.") - try: - assert_almost_equal(cv_results['mean_train_score'][1], - cv_results['mean_train_score'][2]) - except AssertionError: - pass - else: - raise AssertionError("The values are not different.") + assert_false(np.allclose(cv_results['mean_test_score'][1], + cv_results['mean_test_score'][2])) + assert_false(np.allclose(cv_results['mean_train_score'][1], + cv_results['mean_train_score'][2])) # 'min' rank should be assigned to the tied candidates assert_almost_equal(search.cv_results_['rank_test_score'], [1, 1, 3]) From 8a913597cdb05b6ca5332ecad898de5c38715dc5 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Sat, 15 Jul 2017 10:16:03 -0500 Subject: [PATCH 14/16] merge the identical points in doc --- doc/modules/cross_validation.rst | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 69ca6e1edc8c1..a3064c3c9f6f6 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -725,12 +725,7 @@ to shuffle the data indices before splitting them. Note that: shuffling will be different every time ``KFold(..., shuffle=True)`` is iterated. However, ``GridSearchCV`` will use the same shuffling for each set of parameters validated by a single call to its ``fit`` method. -* To ensure results are repeatable (*on the same platform*), use a fixed value - for ``random_state``. - -The randomized CV splitters may return different results for each call of -split. This can be avoided (and identical results returned for each split) by -setting ``random_state`` to an integer. +* To get identical results for each split, set ``random_state`` to an integer. Cross validation and model selection ==================================== From a2346974fdc3b4c4fde8e2b30b7dee258febad36 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Sat, 15 Jul 2017 12:02:14 -0500 Subject: [PATCH 15/16] DOC address Andy's comments --- sklearn/model_selection/_split.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 0122f01059369..386d439184117 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -87,8 +87,8 @@ def split(self, X, y=None, groups=None): Note ---- Randomized CV splitters may return different results for each call of - split. This can be avoided (and identical results returned for each - split) by setting ``random_state`` to an integer. + split. You can make the results identical by setting ``random_state`` + to an integer. """ X, y, groups = indexable(X, y, groups) indices = np.arange(_num_samples(X)) @@ -318,8 +318,8 @@ def split(self, X, y=None, groups=None): Note ---- Randomized CV splitters may return different results for each call of - split. This can be avoided (and identical results returned for each - split) by setting ``random_state`` to an integer. + split. You can make the results identical by setting ``random_state`` + to an integer. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) @@ -658,8 +658,8 @@ def split(self, X, y, groups=None): Note ---- Randomized CV splitters may return different results for each call of - split. This can be avoided (and identical results returned for each - split) by setting random_state to an integer. + split. You can make the results identical by setting ``random_state`` + to an integer. """ y = check_array(y, ensure_2d=False, dtype=None) return super(StratifiedKFold, self).split(X, y, groups) @@ -745,8 +745,8 @@ def split(self, X, y=None, groups=None): Note ---- Randomized CV splitters may return different results for each call of - split. This can be avoided (and identical results returned for each - split) by setting random_state to an integer. + split. You can make the results identical by setting ``random_state`` + to an integer. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) @@ -1189,8 +1189,8 @@ def split(self, X, y=None, groups=None): Note ---- Randomized CV splitters may return different results for each call of - split. This can be avoided (and identical results returned for each - split) by setting random_state to an integer. + split. You can make the results identical by setting ``random_state`` + to an integer. """ X, y, groups = indexable(X, y, groups) for train, test in self._iter_indices(X, y, groups): @@ -1609,8 +1609,8 @@ def split(self, X, y, groups=None): Note ---- Randomized CV splitters may return different results for each call of - split. This can be avoided (and identical results returned for each - split) by setting random_state to an integer. + split. You can make the results identical by setting ``random_state`` + to an integer. """ y = check_array(y, ensure_2d=False, dtype=None) return super(StratifiedShuffleSplit, self).split(X, y, groups) From b4c633f1d1c7a1d935541c6dfb07bcf853ff33da Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Sun, 16 Jul 2017 12:02:56 -0500 Subject: [PATCH 16/16] Move comment to before the check for generator type --- sklearn/model_selection/tests/test_search.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 058269c0d2c0b..5e667727d9dda 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1416,11 +1416,10 @@ def test_grid_search_cv_splits_consistency(): cv=KFold(n_splits=n_splits)) gs2.fit(X, y) + # Give generator as a cv parameter assert_true(isinstance(KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X, y), GeneratorType)) - - # Give generator as a cv parameter gs3 = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [0.1, 0.2, 0.3]}, cv=KFold(n_splits=n_splits, shuffle=True,