[MRG] Update default test_size of ShuffleSplit for 0.21 #13483
@@ -1267,13 +1267,13 @@ def __init__(self, n_splits=5, n_repeats=10, random_state=None):
 class BaseShuffleSplit(metaclass=ABCMeta):
     """Base class for ShuffleSplit and StratifiedShuffleSplit"""

-    def __init__(self, n_splits=10, test_size="default", train_size=None,
+    def __init__(self, n_splits=10, test_size=None, train_size=None,
                  random_state=None):
-        _validate_shuffle_split_init(test_size, train_size)
         self.n_splits = n_splits
         self.test_size = test_size
         self.train_size = train_size
         self.random_state = random_state
+        self._default_test_size = 0.1

     def split(self, X, y=None, groups=None):
         """Generate indices to split data into training and test set.
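The pattern introduced here keeps the public default as ``None`` and resolves it lazily: each splitter records its historical fallback in a private attribute, and the fallback is applied only at split time. A minimal sketch of that idea with illustrative class names (not the scikit-learn source):

class _Splitter:
    # Simplified illustration of the lazy-default pattern: the public default
    # stays None and the private attribute carries the historical fallback.
    def __init__(self, test_size=None, train_size=None):
        self.test_size = test_size
        self.train_size = train_size
        self._default_test_size = 0.1   # subclasses may override

    def _resolved_test_size(self):
        # Fall back only when neither size was given by the user.
        if self.test_size is None and self.train_size is None:
            return self._default_test_size
        return self.test_size

class _GroupSplitter(_Splitter):
    def __init__(self, test_size=None, train_size=None):
        super().__init__(test_size, train_size)
        self._default_test_size = 0.2   # groups historically default to 0.2

print(_Splitter()._resolved_test_size())                 # 0.1
print(_GroupSplitter()._resolved_test_size())            # 0.2
print(_Splitter(train_size=0.8)._resolved_test_size())   # None (complemented later)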
@@ -1354,15 +1354,12 @@ class ShuffleSplit(BaseShuffleSplit):
     n_splits : int, default 10
         Number of re-shuffling & splitting iterations.

-    test_size : float, int, None, default=0.1
+    test_size : float, int, None, default=None
         If float, should be between 0.0 and 1.0 and represent the proportion
         of the dataset to include in the test split. If int, represents the
         absolute number of test samples. If None, the value is set to the
-        complement of the train size. By default (the parameter is
-        unspecified), the value is set to 0.1.
-        The default will change in version 0.21. It will remain 0.1 only
-        if ``train_size`` is unspecified, otherwise it will complement
-        the specified ``train_size``.
+        complement of the train size. If ``train_size`` is also None, it will
+        be set to 0.1.

     train_size : float, int, or None, default=None
         If float, should be between 0.0 and 1.0 and represent the
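Assuming the post-merge behaviour described in this docstring (scikit-learn 0.21 and later), the resolution works like this in practice:

import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.arange(20).reshape(10, 2)

# Neither size given: the historical default of 0.1 still applies.
train, test = next(ShuffleSplit(n_splits=1, random_state=0).split(X))
print(len(train), len(test))   # 9 1

# Only train_size given: test_size now complements it instead of warning.
train, test = next(ShuffleSplit(n_splits=1, train_size=0.7,
                                random_state=0).split(X))
print(len(train), len(test))   # 7 3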
@@ -1406,12 +1403,21 @@ class ShuffleSplit(BaseShuffleSplit):
     TRAIN: [3 4 1] TEST: [5 2]
     TRAIN: [3 5 1] TEST: [2 4]
     """
+    def __init__(self, n_splits=10, test_size=None, train_size=None,
+                 random_state=None):
+        super().__init__(
+            n_splits=n_splits,
+            test_size=test_size,
+            train_size=train_size,
+            random_state=random_state)
+        self._default_test_size = 0.1
+
     def _iter_indices(self, X, y=None, groups=None):
         n_samples = _num_samples(X)
-        n_train, n_test = _validate_shuffle_split(n_samples,
-                                                  self.test_size,
-                                                  self.train_size)
+        n_train, n_test = _validate_shuffle_split(
+            n_samples, self.test_size, self.train_size,
+            default_test_size=self._default_test_size)

         rng = check_random_state(self.random_state)
         for i in range(self.n_splits):
             # random partition
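For readers skimming the diff, the loop that follows this hunk draws one permutation per split and carves the test indices off the front and the train indices off the next block. A simplified stand-alone sketch of that step (hypothetical helper name, not the scikit-learn source):

import numpy as np

def one_shuffle_split(n_samples, n_train, n_test, rng):
    # Draw a fresh permutation and take the first n_test indices as the test
    # set and the following n_train indices as the training set.
    permutation = rng.permutation(n_samples)
    ind_test = permutation[:n_test]
    ind_train = permutation[n_test:(n_test + n_train)]
    return ind_train, ind_test

rng = np.random.RandomState(0)
print(one_shuffle_split(10, n_train=7, n_test=3, rng=rng))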
@@ -1449,14 +1455,12 @@ class GroupShuffleSplit(ShuffleSplit):
     n_splits : int (default 5)
         Number of re-shuffling & splitting iterations.

-    test_size : float, int, None, optional
+    test_size : float, int, None, optional (default=None)
         If float, should be between 0.0 and 1.0 and represent the proportion
         of the dataset to include in the test split. If int, represents the
         absolute number of test groups. If None, the value is set to the
-        complement of the train size. By default, the value is set to 0.2.
-        The default will change in version 0.21. It will remain 0.2 only
-        if ``train_size`` is unspecified, otherwise it will complement
-        the specified ``train_size``.
+        complement of the train size. If ``train_size`` is also None, it will
+        be set to 0.2.

     train_size : float, int, or None, default is None
         If float, should be between 0.0 and 1.0 and represent the
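Note that for GroupShuffleSplit the fractions apply to the distinct groups rather than to individual samples. Assuming the merged behaviour, for example:

import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X = np.arange(40).reshape(20, 2)
groups = np.repeat(np.arange(10), 2)   # 10 groups of 2 samples each

# With both sizes unset, 20% of the *groups* are held out: 2 of the 10 groups,
# i.e. 4 samples here.
gss = GroupShuffleSplit(n_splits=1, random_state=0)
train, test = next(gss.split(X, groups=groups))
print(len(np.unique(groups[test])), len(test))   # 2 4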
@@ -1472,21 +1476,14 @@ class GroupShuffleSplit(ShuffleSplit):

     '''

-    def __init__(self, n_splits=5, test_size="default", train_size=None,
+    def __init__(self, n_splits=5, test_size=None, train_size=None,
                  random_state=None):
-        if test_size == "default":
-            if train_size is not None:
-                warnings.warn("From version 0.21, test_size will always "
-                              "complement train_size unless both "
-                              "are specified.",
-                              FutureWarning)
-            test_size = 0.2
-
         super().__init__(
             n_splits=n_splits,
             test_size=test_size,
             train_size=train_size,
             random_state=random_state)
+        self._default_test_size = 0.2

     def _iter_indices(self, X, y, groups):
         if groups is None:
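The user-visible effect of dropping the warning branch is that a lone ``train_size`` is now simply complemented. Assuming the merged behaviour:

import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X = np.arange(20).reshape(20, 1)
groups = np.repeat(np.arange(10), 2)

# Previously, train_size=0.7 kept test_size at its 0.2 default and raised a
# FutureWarning; with the new default the test set simply complements the
# train set, so 3 of the 10 groups end up held out here.
gss = GroupShuffleSplit(n_splits=1, train_size=0.7, random_state=0)
train, test = next(gss.split(X, groups=groups))
print(len(np.unique(groups[train])), len(np.unique(groups[test])))   # 7 3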
@@ -1624,14 +1621,12 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
     n_splits : int, default 10
         Number of re-shuffling & splitting iterations.

-    test_size : float, int, None, optional
+    test_size : float, int, None, optional (default=None)
         If float, should be between 0.0 and 1.0 and represent the proportion
         of the dataset to include in the test split. If int, represents the
         absolute number of test samples. If None, the value is set to the
-        complement of the train size. By default, the value is set to 0.1.
-        The default will change in version 0.21. It will remain 0.1 only
-        if ``train_size`` is unspecified, otherwise it will complement
-        the specified ``train_size``.
+        complement of the train size. If ``train_size`` is also None, it will
+        be set to 0.1.

     train_size : float, int, or None, default is None
         If float, should be between 0.0 and 1.0 and represent the
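Assuming the merged behaviour, the stratified variant resolves the same 0.1 fallback while keeping the class balance in the held-out set:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.zeros((20, 2))
y = np.array([0] * 10 + [1] * 10)

# Both sizes left unset: test_size falls back to 0.1, i.e. ceil(0.1 * 20) = 2
# test samples, and stratification keeps one sample from each class.
sss = StratifiedShuffleSplit(n_splits=1, random_state=0)
train, test = next(sss.split(X, y))
print(len(test), np.bincount(y[test]))   # 2 [1 1]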
@@ -1667,16 +1662,21 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
     TRAIN: [0 5 1] TEST: [3 4 2]
     """

-    def __init__(self, n_splits=10, test_size="default", train_size=None,
+    def __init__(self, n_splits=10, test_size=None, train_size=None,
                  random_state=None):
         super().__init__(
-            n_splits, test_size, train_size, random_state)
+            n_splits=n_splits,
+            test_size=test_size,
+            train_size=train_size,
+            random_state=random_state)
+        self._default_test_size = 0.1

     def _iter_indices(self, X, y, groups=None):
         n_samples = _num_samples(X)
         y = check_array(y, ensure_2d=False, dtype=None)
-        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
-                                                  self.train_size)
+        n_train, n_test = _validate_shuffle_split(
+            n_samples, self.test_size, self.train_size,
+            default_test_size=self._default_test_size)

         if y.ndim == 2:
             # for multi-label y, map each distinct row to a string repr
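The multi-label branch referenced in the last context line collapses each distinct label row into a single key so it can be stratified on. Roughly (a sketch, not the exact scikit-learn code):

import numpy as np

y = np.array([[0, 1],
              [1, 0],
              [0, 1],
              [1, 1]])

# Collapse each distinct label row into one hashable key so that every unique
# label combination can be treated as a single stratification class.
y_keys = np.array([' '.join(row.astype(str)) for row in y])
print(np.unique(y_keys, return_counts=True))
# three distinct combinations, with counts [2, 1, 1]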
@@ -1770,90 +1770,55 @@ def split(self, X, y, groups=None):
         return super().split(X, y, groups)


-def _validate_shuffle_split_init(test_size, train_size):
-    """Validation helper to check the test_size and train_size at init
-
-    NOTE This does not take into account the number of samples which is known
-    only at split
-    """
-    if test_size == "default":
-        if train_size is not None:
-            warnings.warn("From version 0.21, test_size will always "
-                          "complement train_size unless both "
-                          "are specified.",
-                          FutureWarning)
-        test_size = 0.1
-
-    if test_size is None and train_size is None:
-        raise ValueError('test_size and train_size can not both be None')
-
-    if test_size is not None:
-        if np.asarray(test_size).dtype.kind == 'f':
-            if test_size >= 1. or test_size <= 0:
-                raise ValueError(
-                    'test_size=%f should be in the (0, 1) range '
-                    'or be an integer' % test_size)
-        elif np.asarray(test_size).dtype.kind != 'i':
-            # int values are checked during split based on the input
-            raise ValueError("Invalid value for test_size: %r" % test_size)
-
-    if train_size is not None:
-        if np.asarray(train_size).dtype.kind == 'f':
-            if train_size >= 1. or train_size <= 0:
-                raise ValueError('train_size=%f should be in the (0, 1) range '
-                                 'or be an integer' % train_size)
-            elif (np.asarray(test_size).dtype.kind == 'f' and
-                    (
-                        (train_size + test_size) > 1. or
-                        (train_size + test_size) < 0)):
-                raise ValueError('The sum of test_size and train_size = %f, '
-                                 'should be in the (0, 1) range. Reduce '
-                                 'test_size and/or train_size.' %
-                                 (train_size + test_size))
-        elif np.asarray(train_size).dtype.kind != 'i':
-            # int values are checked during split based on the input
-            raise ValueError("Invalid value for train_size: %r" % train_size)
-
-
-def _validate_shuffle_split(n_samples, test_size, train_size):
+def _validate_shuffle_split(n_samples, test_size, train_size,
+                            default_test_size=None):
     """
     Validation helper to check if the test/test sizes are meaningful wrt to the
     size of the data (n_samples)
     """
-    if (test_size is not None and
-            (np.asarray(test_size).dtype.kind == 'i' and
-                (test_size >= n_samples or test_size <= 0)) or
-            (np.asarray(test_size).dtype.kind == 'f' and
-                (test_size <= 0 or test_size >= 1))):
-        raise ValueError('test_size=%d should be either positive and smaller '
-                         'than the number of samples %d or a float in the '
-                         '(0,1) range' % (test_size, n_samples))
-
-    if (train_size is not None and
-            (np.asarray(train_size).dtype.kind == 'i' and
-                (train_size >= n_samples or train_size <= 0)) or
-            (np.asarray(train_size).dtype.kind == 'f' and
-                (train_size <= 0 or train_size >= 1))):
-        raise ValueError('train_size=%d should be either positive and smaller '
-                         'than the number of samples %d or a float in the '
-                         '(0,1) range' % (train_size, n_samples))
-
-    if test_size == "default":
-        test_size = 0.1
-
-    if np.asarray(test_size).dtype.kind == 'f':
+    if test_size is None and train_size is None:
+        test_size = default_test_size
+
+    test_size_type = np.asarray(test_size).dtype.kind
+    train_size_type = np.asarray(train_size).dtype.kind
+
+    if (test_size_type == 'i' and (test_size >= n_samples or test_size <= 0)
+       or test_size_type == 'f' and (test_size <= 0 or test_size >= 1)):
+        raise ValueError('test_size={0} should be either positive and smaller'
+                         ' than the number of samples {1} or a float in the '
+                         '(0, 1) range'.format(test_size, n_samples))
+
+    if (train_size_type == 'i' and (train_size >= n_samples or train_size <= 0)
+       or train_size_type == 'f' and (train_size <= 0 or train_size >= 1)):
+        raise ValueError('train_size={0} should be either positive and smaller'
+                         ' than the number of samples {1} or a float in the '
+                         '(0, 1) range'.format(train_size, n_samples))
+
+    if train_size is not None and train_size_type not in ('i', 'f'):
+        raise ValueError("Invalid value for train_size: {}".format(train_size))
+    if test_size is not None and test_size_type not in ('i', 'f'):
+        raise ValueError("Invalid value for test_size: {}".format(test_size))
+
+    if (train_size_type == 'f' and test_size_type == 'f' and
+            train_size + test_size > 1):
+        raise ValueError(
+            'The sum of test_size and train_size = {}, should be in the (0, 1)'
+            ' range. Reduce test_size and/or train_size.'
+            .format(train_size + test_size))
+
+    if test_size_type == 'f':
         n_test = ceil(test_size * n_samples)
-    elif np.asarray(test_size).dtype.kind == 'i':
+    elif test_size_type == 'i':
         n_test = float(test_size)

-    if train_size is None:
-        n_train = n_samples - n_test
-    elif np.asarray(train_size).dtype.kind == 'f':
+    if train_size_type == 'f':
         n_train = floor(train_size * n_samples)
-    else:
+    elif train_size_type == 'i':
         n_train = float(train_size)

-    if test_size is None:
+    if train_size is None:
+        n_train = n_samples - n_test
+    elif test_size is None:
         n_test = n_samples - n_train

     if n_train + n_test > n_samples:
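To make the resolved sizes concrete, here is a condensed, stand-alone restatement of the arithmetic above (``resolve_sizes`` is a hypothetical helper, not part of scikit-learn; it skips the error checks and handles only valid inputs):

from math import ceil, floor

def resolve_sizes(n_samples, test_size=None, train_size=None,
                  default_test_size=None):
    # Apply the default only when both sizes are None, ceil the test fraction,
    # floor the train fraction, and let whichever side is still None take the
    # complement.
    if test_size is None and train_size is None:
        test_size = default_test_size
    n_test = ceil(test_size * n_samples) if isinstance(test_size, float) else test_size
    n_train = floor(train_size * n_samples) if isinstance(train_size, float) else train_size
    if train_size is None:
        n_train = n_samples - n_test
    elif test_size is None:
        n_test = n_samples - n_train
    return n_train, n_test

print(resolve_sizes(10, default_test_size=0.1))   # (9, 1)
print(resolve_sizes(10, train_size=0.7))          # (7, 3)
print(resolve_sizes(1000, test_size=0.25))        # (750, 250)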
@@ -2091,14 +2056,12 @@ def train_test_split(*arrays, **options):
         Allowed inputs are lists, numpy arrays, scipy-sparse
         matrices or pandas dataframes.

-    test_size : float, int or None, optional (default=0.25)
+    test_size : float, int or None, optional (default=None)
         If float, should be between 0.0 and 1.0 and represent the proportion
         of the dataset to include in the test split. If int, represents the
         absolute number of test samples. If None, the value is set to the
-        complement of the train size. By default, the value is set to 0.25.
-        The default will change in version 0.21. It will remain 0.25 only
-        if ``train_size`` is unspecified, otherwise it will complement
-        the specified ``train_size``.
+        complement of the train size. If ``train_size`` is also None, it will
+        be set to 0.25.

     train_size : float, int, or None, (default=None)
         If float, should be between 0.0 and 1.0 and represent the
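Assuming the merged 0.21 behaviour, train_test_split resolves its sizes the same way:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(16).reshape(8, 2)
y = np.arange(8)

# Neither size given: the historical default of 0.25 still applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(len(X_train), len(X_test))   # 6 2

# Only train_size given: test_size now complements it instead of warning.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=0)
print(len(X_train), len(X_test))   # 4 4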
@@ -2166,7 +2129,7 @@ def train_test_split(*arrays, **options):
     n_arrays = len(arrays)
     if n_arrays == 0:
         raise ValueError("At least one array required as input")
-    test_size = options.pop('test_size', 'default')
+    test_size = options.pop('test_size', None)
     train_size = options.pop('train_size', None)
     random_state = options.pop('random_state', None)
     stratify = options.pop('stratify', None)
@@ -2175,29 +2138,18 @@ def train_test_split(*arrays, **options):
     if options:
         raise TypeError("Invalid parameters passed: %s" % str(options))

-    if test_size == 'default':
-        test_size = None
-        if train_size is not None:
-            warnings.warn("From version 0.21, test_size will always "
-                          "complement train_size unless both "
-                          "are specified.",
-                          FutureWarning)
-
-    if test_size is None and train_size is None:
-        test_size = 0.25
-
     arrays = indexable(*arrays)

+    n_samples = _num_samples(arrays[0])
+    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
+                                              default_test_size=0.25)
+
     if shuffle is False:
         if stratify is not None:
             raise ValueError(
                 "Stratified train/test split is not implemented for "
                 "shuffle=False")

-        n_samples = _num_samples(arrays[0])
-        n_train, n_test = _validate_shuffle_split(n_samples, test_size,
-                                                  train_size)
-
         train = np.arange(n_train)
         test = np.arange(n_train, n_train + n_test)
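The shuffle=False branch keeps its contiguous split, now using the sizes resolved once at the top of the function. Assuming the merged behaviour:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(10)

# With shuffle=False the split is contiguous: the first n_train samples go to
# the training set and the remaining n_test samples to the test set.
X_train, X_test = train_test_split(X, test_size=0.3, shuffle=False)
print(X_train)   # [0 1 2 3 4 5 6]
print(X_test)    # [7 8 9]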
@@ -2207,8 +2159,8 @@ def train_test_split(*arrays, **options):
         else:
             CVClass = ShuffleSplit

-        cv = CVClass(test_size=test_size,
-                     train_size=train_size,
+        cv = CVClass(test_size=n_test,
+                     train_size=n_train,
                      random_state=random_state)

         train, test = next(cv.split(X=arrays[0], y=stratify))
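A note on this last change: because the sizes are now resolved once in train_test_split, the splitter is handed absolute integer counts rather than the user's raw arguments, so its own 0.1 fallback can never apply and the default is not derived twice. Roughly, assuming the merged behaviour:

import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.arange(20).reshape(10, 2)
y = np.array([0] * 5 + [1] * 5)

# train_test_split(X, y, train_size=0.7) now effectively constructs its
# splitter with the already-resolved integer counts, along these lines:
cv = ShuffleSplit(test_size=3, train_size=7, random_state=0)
train, test = next(cv.split(X, y))
print(len(train), len(test))   # 7 3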