Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 83 additions & 131 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -1267,13 +1267,13 @@ def __init__(self, n_splits=5, n_repeats=10, random_state=None):
class BaseShuffleSplit(metaclass=ABCMeta):
"""Base class for ShuffleSplit and StratifiedShuffleSplit"""

def __init__(self, n_splits=10, test_size=None, train_size=None,
             random_state=None):
    """Store the split configuration; validation is deferred to split time.

    Parameters
    ----------
    n_splits : int, default=10
        Number of re-shuffling & splitting iterations.
    test_size : float, int or None, default=None
        Fraction (float in (0, 1)) or absolute count (int) of test
        samples.  If None, the complement of ``train_size`` is used; if
        both are None, ``self._default_test_size`` applies.
    train_size : float, int or None, default=None
        Fraction or absolute count of train samples; None means the
        complement of ``test_size``.
    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the shuffling.
    """
    self.n_splits = n_splits
    self.test_size = test_size
    self.train_size = train_size
    self.random_state = random_state
    # Fallback used when both test_size and train_size are None;
    # subclasses override this value in their own __init__.
    self._default_test_size = 0.1

def split(self, X, y=None, groups=None):
"""Generate indices to split data into training and test set.
Expand Down Expand Up @@ -1354,15 +1354,12 @@ class ShuffleSplit(BaseShuffleSplit):
n_splits : int, default 10
Number of re-shuffling & splitting iterations.

test_size : float, int, None, default=0.1
test_size : float, int, None, default=None
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. By default (the parameter is
unspecified), the value is set to 0.1.
The default will change in version 0.21. It will remain 0.1 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.
complement of the train size. If ``train_size`` is also None, it will
be set to 0.1.

train_size : float, int, or None, default=None
If float, should be between 0.0 and 1.0 and represent the
Expand Down Expand Up @@ -1406,12 +1403,21 @@ class ShuffleSplit(BaseShuffleSplit):
TRAIN: [3 4 1] TEST: [5 2]
TRAIN: [3 5 1] TEST: [2 4]
"""
def __init__(self, n_splits=10, test_size=None, train_size=None,
             random_state=None):
    # Delegate common parameter storage to the base class, then record
    # the fallback test-set fraction used when neither size is given.
    super().__init__(n_splits=n_splits, test_size=test_size,
                     train_size=train_size, random_state=random_state)
    self._default_test_size = 0.1

def _iter_indices(self, X, y=None, groups=None):
n_samples = _num_samples(X)
n_train, n_test = _validate_shuffle_split(n_samples,
self.test_size,
self.train_size)
n_train, n_test = _validate_shuffle_split(
n_samples, self.test_size, self.train_size,
default_test_size=self._default_test_size)

rng = check_random_state(self.random_state)
for i in range(self.n_splits):
# random partition
Expand Down Expand Up @@ -1449,14 +1455,12 @@ class GroupShuffleSplit(ShuffleSplit):
n_splits : int (default 5)
Number of re-shuffling & splitting iterations.

test_size : float, int, None, optional
test_size : float, int, None, optional (default=None)
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test groups. If None, the value is set to the
complement of the train size. By default, the value is set to 0.2.
The default will change in version 0.21. It will remain 0.2 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.
complement of the train size. If ``train_size`` is also None, it will
be set to 0.2.

train_size : float, int, or None, default is None
If float, should be between 0.0 and 1.0 and represent the
Expand All @@ -1472,21 +1476,14 @@ class GroupShuffleSplit(ShuffleSplit):

'''

def __init__(self, n_splits=5, test_size=None, train_size=None,
             random_state=None):
    """Store the split configuration; validation is deferred to split time.

    Parameters
    ----------
    n_splits : int, default=5
        Number of re-shuffling & splitting iterations.
    test_size : float, int or None, default=None
        Fraction (float in (0, 1)) or absolute count (int) of test
        groups.  If None, the complement of ``train_size`` is used; if
        both are None, 20% of the groups are held out.
    train_size : float, int or None, default=None
        Fraction or absolute count of train groups; None means the
        complement of ``test_size``.
    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the shuffling.
    """
    super().__init__(
        n_splits=n_splits,
        test_size=test_size,
        train_size=train_size,
        random_state=random_state)
    # GroupShuffleSplit holds out 20% of the *groups* by default,
    # unlike the 10% of samples used by the other shuffle splitters.
    self._default_test_size = 0.2

def _iter_indices(self, X, y, groups):
if groups is None:
Expand Down Expand Up @@ -1624,14 +1621,12 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
n_splits : int, default 10
Number of re-shuffling & splitting iterations.

test_size : float, int, None, optional
test_size : float, int, None, optional (default=None)
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. By default, the value is set to 0.1.
The default will change in version 0.21. It will remain 0.1 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.
complement of the train size. If ``train_size`` is also None, it will
be set to 0.1.

train_size : float, int, or None, default is None
If float, should be between 0.0 and 1.0 and represent the
Expand Down Expand Up @@ -1667,16 +1662,21 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
TRAIN: [0 5 1] TEST: [3 4 2]
"""

def __init__(self, n_splits=10, test_size=None, train_size=None,
             random_state=None):
    """Store the split configuration; validation is deferred to split time.

    Parameters
    ----------
    n_splits : int, default=10
        Number of re-shuffling & splitting iterations.
    test_size : float, int or None, default=None
        Fraction (float in (0, 1)) or absolute count (int) of test
        samples.  If None, the complement of ``train_size`` is used; if
        both are None, 10% of the samples are held out.
    train_size : float, int or None, default=None
        Fraction or absolute count of train samples; None means the
        complement of ``test_size``.
    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the shuffling.
    """
    super().__init__(
        n_splits=n_splits,
        test_size=test_size,
        train_size=train_size,
        random_state=random_state)
    # Fallback applied by _validate_shuffle_split when both sizes are None.
    self._default_test_size = 0.1

def _iter_indices(self, X, y, groups=None):
n_samples = _num_samples(X)
y = check_array(y, ensure_2d=False, dtype=None)
n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
self.train_size)
n_train, n_test = _validate_shuffle_split(
n_samples, self.test_size, self.train_size,
default_test_size=self._default_test_size)

if y.ndim == 2:
# for multi-label y, map each distinct row to a string repr
Expand Down Expand Up @@ -1770,90 +1770,55 @@ def split(self, X, y, groups=None):
return super().split(X, y, groups)


def _validate_shuffle_split_init(test_size, train_size):
"""Validation helper to check the test_size and train_size at init

NOTE This does not take into account the number of samples which is known
only at split
"""
if test_size == "default":
if train_size is not None:
warnings.warn("From version 0.21, test_size will always "
"complement train_size unless both "
"are specified.",
FutureWarning)
test_size = 0.1

if test_size is None and train_size is None:
raise ValueError('test_size and train_size can not both be None')

if test_size is not None:
if np.asarray(test_size).dtype.kind == 'f':
if test_size >= 1. or test_size <= 0:
raise ValueError(
'test_size=%f should be in the (0, 1) range '
'or be an integer' % test_size)
elif np.asarray(test_size).dtype.kind != 'i':
# int values are checked during split based on the input
raise ValueError("Invalid value for test_size: %r" % test_size)

if train_size is not None:
if np.asarray(train_size).dtype.kind == 'f':
if train_size >= 1. or train_size <= 0:
raise ValueError('train_size=%f should be in the (0, 1) range '
'or be an integer' % train_size)
elif (np.asarray(test_size).dtype.kind == 'f' and
(
(train_size + test_size) > 1. or
(train_size + test_size) < 0)):
raise ValueError('The sum of test_size and train_size = %f, '
'should be in the (0, 1) range. Reduce '
'test_size and/or train_size.' %
(train_size + test_size))
elif np.asarray(train_size).dtype.kind != 'i':
# int values are checked during split based on the input
raise ValueError("Invalid value for train_size: %r" % train_size)


def _validate_shuffle_split(n_samples, test_size, train_size):
def _validate_shuffle_split(n_samples, test_size, train_size,
default_test_size=None):
"""
Validation helper to check if the train/test sizes are meaningful with
regard to the size of the data (n_samples)
"""
if (test_size is not None and
(np.asarray(test_size).dtype.kind == 'i' and
(test_size >= n_samples or test_size <= 0)) or
(np.asarray(test_size).dtype.kind == 'f' and
(test_size <= 0 or test_size >= 1))):
raise ValueError('test_size=%d should be either positive and smaller '
'than the number of samples %d or a float in the '
'(0,1) range' % (test_size, n_samples))

if (train_size is not None and
(np.asarray(train_size).dtype.kind == 'i' and
(train_size >= n_samples or train_size <= 0)) or
(np.asarray(train_size).dtype.kind == 'f' and
(train_size <= 0 or train_size >= 1))):
raise ValueError('train_size=%d should be either positive and smaller '
'than the number of samples %d or a float in the '
'(0,1) range' % (train_size, n_samples))

if test_size == "default":
test_size = 0.1

if np.asarray(test_size).dtype.kind == 'f':
if test_size is None and train_size is None:
test_size = default_test_size

test_size_type = np.asarray(test_size).dtype.kind
train_size_type = np.asarray(train_size).dtype.kind

if (test_size_type == 'i' and (test_size >= n_samples or test_size <= 0)
or test_size_type == 'f' and (test_size <= 0 or test_size >= 1)):
raise ValueError('test_size={0} should be either positive and smaller'
' than the number of samples {1} or a float in the '
'(0, 1) range'.format(test_size, n_samples))

if (train_size_type == 'i' and (train_size >= n_samples or train_size <= 0)
or train_size_type == 'f' and (train_size <= 0 or train_size >= 1)):
raise ValueError('train_size={0} should be either positive and smaller'
' than the number of samples {1} or a float in the '
'(0, 1) range'.format(train_size, n_samples))

if train_size is not None and train_size_type not in ('i', 'f'):
raise ValueError("Invalid value for train_size: {}".format(train_size))
if test_size is not None and test_size_type not in ('i', 'f'):
raise ValueError("Invalid value for test_size: {}".format(test_size))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically it should be a TypeError but I guess it's too late to change now as it was already raising a ValueError for this case in released versions of scikit-learn.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I guess so...


if (train_size_type == 'f' and test_size_type == 'f' and
train_size + test_size > 1):
raise ValueError(
'The sum of test_size and train_size = {}, should be in the (0, 1)'
' range. Reduce test_size and/or train_size.'
.format(train_size + test_size))

if test_size_type == 'f':
n_test = ceil(test_size * n_samples)
elif np.asarray(test_size).dtype.kind == 'i':
elif test_size_type == 'i':
n_test = float(test_size)

if train_size is None:
n_train = n_samples - n_test
elif np.asarray(train_size).dtype.kind == 'f':
if train_size_type == 'f':
n_train = floor(train_size * n_samples)
else:
elif train_size_type == 'i':
n_train = float(train_size)

if test_size is None:
if train_size is None:
n_train = n_samples - n_test
elif test_size is None:
n_test = n_samples - n_train

if n_train + n_test > n_samples:
Expand Down Expand Up @@ -2091,14 +2056,12 @@ def train_test_split(*arrays, **options):
Allowed inputs are lists, numpy arrays, scipy-sparse
matrices or pandas dataframes.

test_size : float, int or None, optional (default=0.25)
test_size : float, int or None, optional (default=None)
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. By default, the value is set to 0.25.
The default will change in version 0.21. It will remain 0.25 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.
complement of the train size. If ``train_size`` is also None, it will
be set to 0.25.

train_size : float, int, or None, (default=None)
If float, should be between 0.0 and 1.0 and represent the
Expand Down Expand Up @@ -2166,7 +2129,7 @@ def train_test_split(*arrays, **options):
n_arrays = len(arrays)
if n_arrays == 0:
raise ValueError("At least one array required as input")
test_size = options.pop('test_size', 'default')
test_size = options.pop('test_size', None)
train_size = options.pop('train_size', None)
random_state = options.pop('random_state', None)
stratify = options.pop('stratify', None)
Expand All @@ -2175,29 +2138,18 @@ def train_test_split(*arrays, **options):
if options:
raise TypeError("Invalid parameters passed: %s" % str(options))

if test_size == 'default':
test_size = None
if train_size is not None:
warnings.warn("From version 0.21, test_size will always "
"complement train_size unless both "
"are specified.",
FutureWarning)

if test_size is None and train_size is None:
test_size = 0.25

arrays = indexable(*arrays)

n_samples = _num_samples(arrays[0])
n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
default_test_size=0.25)

if shuffle is False:
if stratify is not None:
raise ValueError(
"Stratified train/test split is not implemented for "
"shuffle=False")

n_samples = _num_samples(arrays[0])
n_train, n_test = _validate_shuffle_split(n_samples, test_size,
train_size)

train = np.arange(n_train)
test = np.arange(n_train, n_train + n_test)

Expand All @@ -2207,8 +2159,8 @@ def train_test_split(*arrays, **options):
else:
CVClass = ShuffleSplit

cv = CVClass(test_size=test_size,
train_size=train_size,
cv = CVClass(test_size=n_test,
train_size=n_train,
random_state=random_state)

train, test = next(cv.split(X=arrays[0], y=stratify))
Expand Down
Loading