Skip to content

[WIP] Add repeated cross-validations #7960

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions sklearn/model_selection/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from ._split import BaseCrossValidator
from ._split import KFold
from ._split import RepeatedKFold
from ._split import GroupKFold
from ._split import StratifiedKFold
from ._split import RepeatedStratifiedKFold
from ._split import TimeSeriesSplit
from ._split import LeaveOneGroupOut
from ._split import LeaveOneOut
Expand Down Expand Up @@ -30,6 +32,8 @@
'GridSearchCV',
'TimeSeriesSplit',
'KFold',
'RepeatedKFold',
'RepeatedStratifiedKFold',
'GroupKFold',
'GroupShuffleSplit',
'LeaveOneGroupOut',
Expand Down
173 changes: 173 additions & 0 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
'StratifiedKFold',
'StratifiedShuffleSplit',
'PredefinedSplit',
'RepeatedKFold',
'RepeatedStratifiedKFold',
'train_test_split',
'check_cv']

Expand Down Expand Up @@ -913,6 +915,177 @@ def get_n_splits(self, X, y, groups):
return int(comb(len(np.unique(groups)), self.n_groups, exact=True))


class BaseRepeatedCrossValidator(with_metaclass(ABCMeta)):
    """Base class for repeated cross-validators.

    Runs the cross-validator returned by ``_get_cv`` ``n_iter`` times,
    each repetition seeded with its own random state so repetitions
    yield different — yet reproducible — splits.

    Parameters
    ----------
    n_iter : int, default=2
        Number of times the cross-validation is repeated.
        Must be an integer greater than 1.

    n_splits : int, default=3
        Number of folds per repetition. Forwarded to the concrete
        cross-validator by subclasses.

    random_states : array-like of int, shape (n_iter,), default=None
        One seed per repetition. If None, ``n_iter`` seeds are drawn
        at random.

    Raises
    ------
    ValueError
        If ``n_iter`` is not an integer greater than 1, or if
        ``random_states`` is given but does not contain exactly
        ``n_iter`` integers.
    """
    def __init__(self, n_iter=2, n_splits=3, random_states=None):
        if not isinstance(n_iter, numbers.Integral):
            raise ValueError("Number of repetitions must be of Integral type.")

        if n_iter <= 1:
            raise ValueError("Number of repetitions must be greater than 1.")

        self.n_iter = n_iter
        self.n_splits = n_splits

        if random_states is None:
            # One independent seed per *repetition*: split() indexes
            # self._random_states with the repetition index, so there must
            # be exactly n_iter seeds (not n_splits — using n_splits here
            # raised IndexError whenever n_iter > n_splits).
            self._random_states = [
                check_random_state(None).randint(np.iinfo(np.int32).max)
                for _ in range(n_iter)]
        else:
            random_states = check_array(random_states, ensure_2d=False,
                                        dtype=None)
            if random_states.size != n_iter:
                raise ValueError(
                    "Size of random_states must be equal to n_iter.")

            if any(not isinstance(random_state, (np.integer, numbers.Integral))
                   for random_state in random_states):
                raise ValueError("All random states must be int.")

            self._random_states = random_states

    def split(self, X, y=None, groups=None):
        """Generates indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, of length n_samples
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        for idx in range(self.n_iter):
            # Each repetition gets a freshly-seeded RandomState so that
            # the shuffled folds differ between repetitions.
            random_state = check_random_state(self._random_states[idx])
            cv = self._get_cv(random_state)
            for train_index, test_index in cv.split(X, y, groups):
                yield train_index, test_index

    def _get_cv(self, random_state):
        """Returns an instance of cross-validator.

        This method needs to be implemented by classes that inherit
        BaseRepeatedCrossValidator.

        Parameters
        ----------
        random_state : Instance of RandomState
            Random state that should be used to create the cv instance.

        Returns
        -------
        cv : Instance of BaseCrossValidator.
        """
        raise NotImplementedError

    def get_n_iter(self):
        """Returns number of repetitions to be performed."""
        return self.n_iter


class RepeatedKFold(BaseRepeatedCrossValidator):
    """Repeated K-Fold cross validator.

    Repeats K-Fold n times, each repetition with different shuffling.

    Parameters
    ----------
    n_iter : int, default=2
        Number of times KFold needs to be repeated.

    n_splits : int, default=5
        Number of folds. Must be at least 2.

    random_states : array-like, with shape (n_iter,), default=None
        Random states to be used for each repetition.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import RepeatedKFold
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 1, 1])
    >>> random_states = [1944695409, 258173307]
    >>> rkf = RepeatedKFold(n_splits=2, random_states=random_states)
    >>> rkf.get_n_iter()
    2
    >>> for train_index, test_index in rkf.split(X):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    TRAIN: [1 3] TEST: [0 2]
    TRAIN: [0 2] TEST: [1 3]
    TRAIN: [2 3] TEST: [0 1]
    TRAIN: [0 1] TEST: [2 3]
    """
    # Note: the metaclass (ABCMeta) is inherited from
    # BaseRepeatedCrossValidator, so no with_metaclass wrapper is needed.
    def __init__(self, n_iter=2, n_splits=5, random_states=None):
        super(RepeatedKFold, self).__init__(n_iter, n_splits, random_states)

    def _get_cv(self, random_state):
        # shuffle=True so that each repetition (seeded differently)
        # produces a different fold assignment.
        return KFold(self.n_splits, shuffle=True, random_state=random_state)


class RepeatedStratifiedKFold(BaseRepeatedCrossValidator):
    """Repeated Stratified K-Fold cross validator.

    Repeats Stratified K-Fold n times, each repetition with different
    shuffling.

    Parameters
    ----------
    n_iter : int, default=2
        Number of times StratifiedKFold needs to be repeated.

    n_splits : int, default=5
        Number of folds. Must be at least 2.

    random_states : array-like, with shape (n_iter,), default=None
        Random states to be used for each repetition.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import RepeatedStratifiedKFold
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 1, 1])
    >>> random_states = [1944695409, 258173307]
    >>> rskf = RepeatedStratifiedKFold(n_splits=2, random_states=random_states)
    >>> rskf.get_n_iter()
    2
    >>> for train_index, test_index in rskf.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    TRAIN: [1 3] TEST: [0 2]
    TRAIN: [0 2] TEST: [1 3]
    TRAIN: [0 2] TEST: [1 3]
    TRAIN: [1 3] TEST: [0 2]
    """
    # Note: the metaclass (ABCMeta) is inherited from
    # BaseRepeatedCrossValidator, so no with_metaclass wrapper is needed.
    def __init__(self, n_iter=2, n_splits=5, random_states=None):
        super(RepeatedStratifiedKFold, self).__init__(
            n_iter, n_splits, random_states)

    def _get_cv(self, random_state):
        # shuffle=True so that each repetition (seeded differently)
        # produces a different stratified fold assignment.
        return StratifiedKFold(self.n_splits, shuffle=True,
                               random_state=random_state)


class BaseShuffleSplit(with_metaclass(ABCMeta)):
"""Base class for ShuffleSplit and StratifiedShuffleSplit"""

Expand Down
77 changes: 77 additions & 0 deletions sklearn/model_selection/tests/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
from sklearn.model_selection import check_cv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.linear_model import Ridge

Expand Down Expand Up @@ -679,6 +681,81 @@ def test_predefinedsplit_with_kfold_split():
assert_array_equal(ps_test, kf_test)


def test_repeated_kfold_valueerrors():
    # random_states of the wrong length, or containing non-int elements,
    # must be rejected at construction time.
    too_short = [0]
    wrong_type = [np.random.RandomState()]

    assert_raises(ValueError, RepeatedKFold, n_iter=3,
                  random_states=too_short)
    assert_raises(ValueError, RepeatedKFold, n_iter=3,
                  random_states=wrong_type)

    # n_iter must be an integer strictly greater than 1.
    assert_raises(ValueError, RepeatedKFold, n_iter=1.5)
    assert_raises(ValueError, RepeatedKFold, n_iter=1)


def test_repeated_kfold_deterministic_split():
    # With fixed seeds, 2 repetitions of 2-fold KFold on 5 samples must
    # always produce the same four train/test partitions, in order.
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    random_states = [1944695409, 258173307]

    expected = [
        ([1, 3], [0, 2, 4]),
        ([0, 2, 4], [1, 3]),
        ([2, 4], [0, 1, 3]),
        ([0, 1, 3], [2, 4]),
    ]

    splits = RepeatedKFold(2, 2, random_states).split(X)
    for expected_train, expected_test in expected:
        train, test = next(splits)
        assert_array_equal(train, expected_train)
        assert_array_equal(test, expected_test)


def test_repeated_stratified_kfold_valueerrors():
    # random_states of the wrong length, or containing non-int elements,
    # must be rejected at construction time.
    too_short = [0]
    wrong_type = [np.random.RandomState()]

    assert_raises(ValueError, RepeatedStratifiedKFold,
                  n_iter=3, random_states=too_short)
    assert_raises(ValueError, RepeatedStratifiedKFold,
                  n_iter=3, random_states=wrong_type)

    # n_iter must be an integer strictly greater than 1.
    assert_raises(ValueError, RepeatedStratifiedKFold, n_iter=1.5)
    assert_raises(ValueError, RepeatedStratifiedKFold, n_iter=1)


def test_repeated_stratified_kfold_determistic_split():
    # NOTE(review): function name has a typo ("determistic"); kept as-is so
    # the discovered test name does not change.
    # With fixed seeds, 2 repetitions of stratified 2-fold on 5 samples
    # must always produce the same four train/test partitions, in order.
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    y = [0, 0, 1, 1, 1]
    random_states = [1944695409, 258173307]

    expected = [
        ([1, 3], [0, 2, 4]),
        ([0, 2, 4], [1, 3]),
        ([0, 4], [1, 2, 3]),
        ([1, 2, 3], [0, 4]),
    ]

    splits = RepeatedStratifiedKFold(2, 2, random_states).split(X, y)
    for expected_train, expected_test in expected:
        train, test = next(splits)
        assert_array_equal(train, expected_train)
        assert_array_equal(test, expected_test)


def test_group_shuffle_split():
for groups_i in test_groups:
X = y = np.ones(len(groups_i))
Expand Down