[MRG+3] CV splitters: train/test_size default behavior will change in 0.21 (scikit-learn#7459)

nelson-liu · jnothman · commit 1f781e64a226 · 2017-06-14T21:28:52.000+10:00
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -454,6 +454,11 @@ API changes summary
      method ``check_decision_proba_consistency`` has been added in
      **sklearn.utils.estimator_checks** to check their consistency.
      :issue:`7578` by :user:`Shubham Bhardwaj <shubham0704>`
+   
+   - In version 0.21, the default behavior of splitters that use the
+     ``test_size`` and ``train_size`` parameter will change, such that
+     specifying ``train_size`` alone will cause ``test_size`` to be the
+     remainder. :issue:`7459` by :user:`Nelson Liu <nelson-liu>`.
 
    - All tree based estimators now accept a ``min_impurity_decrease``
      parameter in lieu of the ``min_impurity_split``, which is now deprecated.
@@ -506,7 +511,6 @@ API changes summary
      - ``utils.stats.rankdata``
      - ``neighbors.approximate.LSHForest``
 
-
 .. _changes_0_18_1:
 
 Version 0.18.1
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
@@ -1133,7 +1133,7 @@ def __init__(self, n_splits=5, n_repeats=10, random_state=None):
 class BaseShuffleSplit(with_metaclass(ABCMeta)):
     """Base class for ShuffleSplit and StratifiedShuffleSplit"""
 
-    def __init__(self, n_splits=10, test_size=0.1, train_size=None,
+    def __init__(self, n_splits=10, test_size="default", train_size=None,
                  random_state=None):
         _validate_shuffle_split_init(test_size, train_size)
         self.n_splits = n_splits
@@ -1211,16 +1211,20 @@ class ShuffleSplit(BaseShuffleSplit):
 
     Parameters
     ----------
-    n_splits : int (default 10)
+    n_splits : int, default 10
         Number of re-shuffling & splitting iterations.
 
-    test_size : float, int, or None, default 0.1
-        If float, should be between 0.0 and 1.0 and represent the
-        proportion of the dataset to include in the test split. If
-        int, represents the absolute number of test samples. If None,
-        the value is automatically set to the complement of the train size.
-
-    train_size : float, int, or None (default is None)
+    test_size : float, int, None, default=0.1
+        If float, should be between 0.0 and 1.0 and represent the proportion
+        of the dataset to include in the test split. If int, represents the
+        absolute number of test samples. If None, the value is set to the
+        complement of the train size. By default (the is parameter
+        unspecified), the value is set to 0.1.
+        The default will change in version 0.21. It will remain 0.1 only
+        if ``train_size`` is unspecified, otherwise it will complement
+        the specified ``train_size``.
+
+    train_size : float, int, or None, default=None
         If float, should be between 0.0 and 1.0 and represent the
         proportion of the dataset to include in the train split. If
         int, represents the absolute number of train samples. If None,
@@ -1260,7 +1264,8 @@ class ShuffleSplit(BaseShuffleSplit):
 
     def _iter_indices(self, X, y=None, groups=None):
         n_samples = _num_samples(X)
-        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
+        n_train, n_test = _validate_shuffle_split(n_samples,
+                                                  self.test_size,
                                                   self.train_size)
         rng = check_random_state(self.random_state)
         for i in range(self.n_splits):
@@ -1299,13 +1304,16 @@ class GroupShuffleSplit(ShuffleSplit):
     n_splits : int (default 5)
         Number of re-shuffling & splitting iterations.
 
-    test_size : float (default 0.2), int, or None
-        If float, should be between 0.0 and 1.0 and represent the
-        proportion of the groups to include in the test split. If
-        int, represents the absolute number of test groups. If None,
-        the value is automatically set to the complement of the train size.
+    test_size : float, int, None, optional
+        If float, should be between 0.0 and 1.0 and represent the proportion
+        of the dataset to include in the test split. If int, represents the
+        absolute number of test samples. If None, the value is set to the
+        complement of the train size. By default, the value is set to 0.2.
+        The default will change in version 0.21. It will remain 0.2 only
+        if ``train_size`` is unspecified, otherwise it will complement
+        the specified ``train_size``.
 
-    train_size : float, int, or None (default is None)
+    train_size : float, int, or None, default is None
         If float, should be between 0.0 and 1.0 and represent the
         proportion of the groups to include in the train split. If
         int, represents the absolute number of train groups. If None,
@@ -1319,8 +1327,16 @@ class GroupShuffleSplit(ShuffleSplit):
 
     '''
 
-    def __init__(self, n_splits=5, test_size=0.2, train_size=None,
+    def __init__(self, n_splits=5, test_size="default", train_size=None,
                  random_state=None):
+        if test_size == "default":
+            if train_size is not None:
+                warnings.warn("From version 0.21, test_size will always "
+                              "complement train_size unless both "
+                              "are specified.",
+                              FutureWarning)
+            test_size = 0.2
+
         super(GroupShuffleSplit, self).__init__(
             n_splits=n_splits,
             test_size=test_size,
@@ -1428,16 +1444,19 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
 
     Parameters
     ----------
-    n_splits : int (default 10)
+    n_splits : int, default 10
         Number of re-shuffling & splitting iterations.
 
-    test_size : float (default 0.1), int, or None
-        If float, should be between 0.0 and 1.0 and represent the
-        proportion of the dataset to include in the test split. If
-        int, represents the absolute number of test samples. If None,
-        the value is automatically set to the complement of the train size.
+    test_size : float, int, None, optional
+        If float, should be between 0.0 and 1.0 and represent the proportion
+        of the dataset to include in the test split. If int, represents the
+        absolute number of test samples. If None, the value is set to the
+        complement of the train size. By default, the value is set to 0.1.
+        The default will change in version 0.21. It will remain 0.1 only
+        if ``train_size`` is unspecified, otherwise it will complement
+        the specified ``train_size``.
 
-    train_size : float, int, or None (default is None)
+    train_size : float, int, or None, default is None
         If float, should be between 0.0 and 1.0 and represent the
         proportion of the dataset to include in the train split. If
         int, represents the absolute number of train samples. If None,
@@ -1468,7 +1487,7 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
     TRAIN: [0 2] TEST: [3 1]
     """
 
-    def __init__(self, n_splits=10, test_size=0.1, train_size=None,
+    def __init__(self, n_splits=10, test_size="default", train_size=None,
                  random_state=None):
         super(StratifiedShuffleSplit, self).__init__(
             n_splits, test_size, train_size, random_state)
@@ -1563,6 +1582,14 @@ def _validate_shuffle_split_init(test_size, train_size):
     NOTE This does not take into account the number of samples which is known
     only at split
     """
+    if test_size == "default":
+        if train_size is not None:
+            warnings.warn("From version 0.21, test_size will always "
+                          "complement train_size unless both "
+                          "are specified.",
+                          FutureWarning)
+        test_size = 0.1
+
     if test_size is None and train_size is None:
         raise ValueError('test_size and train_size can not both be None')
 
@@ -1597,16 +1624,21 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
     Validation helper to check if the test/test sizes are meaningful wrt to the
     size of the data (n_samples)
     """
-    if (test_size is not None and np.asarray(test_size).dtype.kind == 'i' and
+    if (test_size is not None and
+            np.asarray(test_size).dtype.kind == 'i' and
             test_size >= n_samples):
         raise ValueError('test_size=%d should be smaller than the number of '
                          'samples %d' % (test_size, n_samples))
 
-    if (train_size is not None and np.asarray(train_size).dtype.kind == 'i' and
+    if (train_size is not None and
+            np.asarray(train_size).dtype.kind == 'i' and
             train_size >= n_samples):
         raise ValueError("train_size=%d should be smaller than the number of"
                          " samples %d" % (train_size, n_samples))
 
+    if test_size == "default":
+        test_size = 0.1
+
     if np.asarray(test_size).dtype.kind == 'f':
         n_test = ceil(test_size * n_samples)
     elif np.asarray(test_size).dtype.kind == 'i':
@@ -1844,14 +1876,16 @@ def train_test_split(*arrays, **options):
         Allowed inputs are lists, numpy arrays, scipy-sparse
         matrices or pandas dataframes.
 
-    test_size : float, int, or None (default is None)
-        If float, should be between 0.0 and 1.0 and represent the
-        proportion of the dataset to include in the test split. If
-        int, represents the absolute number of test samples. If None,
-        the value is automatically set to the complement of the train size.
-        If train size is also None, test size is set to 0.25.
+    test_size : float, int, None, optional
+        If float, should be between 0.0 and 1.0 and represent the proportion
+        of the dataset to include in the test split. If int, represents the
+        absolute number of test samples. If None, the value is set to the
+        complement of the train size. By default, the value is set to 0.25.
+        The default will change in version 0.21. It will remain 0.25 only
+        if ``train_size`` is unspecified, otherwise it will complement
+        the specified ``train_size``.
 
-    train_size : float, int, or None (default is None)
+    train_size : float, int, or None, default None
         If float, should be between 0.0 and 1.0 and represent the
         proportion of the dataset to include in the train split. If
         int, represents the absolute number of train samples. If None,
@@ -1917,7 +1951,7 @@ def train_test_split(*arrays, **options):
     n_arrays = len(arrays)
     if n_arrays == 0:
         raise ValueError("At least one array required as input")
-    test_size = options.pop('test_size', None)
+    test_size = options.pop('test_size', 'default')
     train_size = options.pop('train_size', None)
     random_state = options.pop('random_state', None)
     stratify = options.pop('stratify', None)
@@ -1926,6 +1960,14 @@ def train_test_split(*arrays, **options):
     if options:
         raise TypeError("Invalid parameters passed: %s" % str(options))
 
+    if test_size == 'default':
+        test_size = None
+        if train_size is not None:
+            warnings.warn("From version 0.21, test_size will always "
+                          "complement train_size unless both "
+                          "are specified.",
+                          FutureWarning)
+
     if test_size is None and train_size is None:
         test_size = 0.25
 
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
@@ -20,6 +20,7 @@
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_warns_message
+from sklearn.utils.testing import assert_warns
 from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import ignore_warnings
 from sklearn.utils.validation import _num_samples
@@ -163,8 +164,8 @@ def test_cross_validator_with_default_params():
     skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
     lolo_repr = "LeaveOneGroupOut()"
     lopo_repr = "LeavePGroupsOut(n_groups=2)"
-    ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, "
-               "train_size=None)")
+    ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, "
+               "test_size='default',\n       train_size=None)")
     ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"
 
     n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits,
@@ -527,6 +528,7 @@ def test_shuffle_split():
         assert_array_equal(t3[1], t4[1])
 
 
+@ignore_warnings
 def test_stratified_shuffle_split_init():
     X = np.arange(7)
     y = np.asarray([0, 1, 1, 1, 2, 2, 2])
@@ -859,6 +861,7 @@ def test_leave_one_p_group_out_error_on_fewer_number_of_groups():
                          LeavePGroupsOut(n_groups=3).split(X, y, groups))
 
 
+@ignore_warnings
 def test_repeated_cv_value_errors():
     # n_repeats is not integer or <= 0
     for cv in (RepeatedKFold, RepeatedStratifiedKFold):
@@ -1070,6 +1073,7 @@ def train_test_split_list_input():
         np.testing.assert_equal(y_test3, y_test2)
 
 
+@ignore_warnings
 def test_shufflesplit_errors():
     # When the {test|train}_size is a float/invalid, error is raised at init
     assert_raises(ValueError, ShuffleSplit, test_size=None, train_size=None)
@@ -1366,6 +1370,14 @@ def test_nested_cv():
                         fit_params={'groups': groups})
 
 
+def test_train_test_default_warning():
+    assert_warns(FutureWarning, ShuffleSplit, train_size=0.75)
+    assert_warns(FutureWarning, GroupShuffleSplit, train_size=0.75)
+    assert_warns(FutureWarning, StratifiedShuffleSplit, train_size=0.75)
+    assert_warns(FutureWarning, train_test_split, range(3),
+                 train_size=0.75)
+
+
 def test_build_repr():
     class MockSplitter:
         def __init__(self, a, b=0, c=None):