diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py
index fba26a9ffa5cc..2965f4e18ca60 100644
--- a/sklearn/covariance/graph_lasso_.py
+++ b/sklearn/covariance/graph_lasso_.py
@@ -20,7 +20,8 @@
 from ..utils.extmath import pinvh
 from ..linear_model import lars_path
 from ..linear_model import cd_fast
-from ..cross_validation import _check_cv as check_cv, cross_val_score
+from ..model_selection.partition import _check_cv as check_cv
+from ..model_selection import cross_val_score
 from ..externals.joblib import Parallel, delayed
 import collections
 
@@ -388,7 +389,7 @@ def graph_lasso_path(X, alphas, cov_init=None, X_test=None, mode='cd',
 
 
 class GraphLassoCV(GraphLasso):
-    """Sparse inverse covariance w/ cross-validated choice of the l1 penalty
+    """Sparse inverse covariance w/ cross-validated choice of the l1 penalty
 
     Parameters
     ----------
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index c5206a4fd5239..28168b645b3fa 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -12,22 +12,15 @@
 from __future__ import division
 
 import warnings
-from itertools import chain, combinations
-from math import ceil, floor, factorial
+from math import ceil
 import numbers
-import time
-from abc import ABCMeta, abstractmethod
 
-import numpy as np
-import scipy.sparse as sp
+from .utils import check_random_state
 
-from .base import is_classifier, clone
-from .utils import check_arrays, check_random_state, safe_mask
-from .utils.validation import _num_samples
-from .externals.joblib import Parallel, delayed, logger
-from .externals.six import with_metaclass
-from .externals.six.moves import zip
-from .metrics.scorer import check_scoring
+from .model_selection.partition import LeaveOneOut, LeavePOut, KFold, \
+    StratifiedKFold, LeaveOneLabelOut, LeavePLabelOut, \
+    ShuffleSplit, StratifiedShuffleSplit, train_test_split, check_cv
+from .model_selection.validate import cross_val_score, permutation_test_score
 
 __all__ = ['Bootstrap',
            'KFold',
@@ -44,566 +37,8 @@
            'train_test_split']
 
 
-class _PartitionIterator(with_metaclass(ABCMeta)):
-    """Base class for CV iterators where train_mask = ~test_mask
-
-    Implementations must define `_iter_test_masks` or `_iter_test_indices`.
-
-    Parameters
-    ----------
-    n : int
-        Total number of elements in dataset.
-    """
-
-    def __init__(self, n, indices=None):
-        if indices is None:
-            indices = True
-        else:
-            warnings.warn("The indices parameter is deprecated and will be "
-                          "removed (assumed True) in 0.17", DeprecationWarning,
-                          stacklevel=1)
-        if abs(n - int(n)) >= np.finfo('f').eps:
-            raise ValueError("n must be an integer")
-        self.n = int(n)
-        self._indices = indices
-
-    @property
-    def indices(self):
-        warnings.warn("The indices attribute is deprecated and will be "
-                      "removed (assumed True) in 0.17", DeprecationWarning,
-                      stacklevel=1)
-        return self._indices
-
-    def __iter__(self):
-        indices = self._indices
-        if indices:
-            ind = np.arange(self.n)
-        for test_index in self._iter_test_masks():
-            train_index = np.logical_not(test_index)
-            if indices:
-                train_index = ind[train_index]
-                test_index = ind[test_index]
-            yield train_index, test_index
-
-    # Since subclasses must implement either _iter_test_masks or
-    # _iter_test_indices, neither can be abstract.
-    def _iter_test_masks(self):
-        """Generates boolean masks corresponding to test sets.
- - By default, delegates to _iter_test_indices() - """ - for test_index in self._iter_test_indices(): - test_mask = self._empty_mask() - test_mask[test_index] = True - yield test_mask - - def _iter_test_indices(self): - """Generates integer indices corresponding to test sets.""" - raise NotImplementedError - - def _empty_mask(self): - return np.zeros(self.n, dtype=np.bool) - - -class LeaveOneOut(_PartitionIterator): - """Leave-One-Out cross validation iterator. - - Provides train/test indices to split data in train test sets. Each - sample is used once as a test set (singleton) while the remaining - samples form the training set. - - Note: ``LeaveOneOut(n)`` is equivalent to ``KFold(n, n_folds=n)`` and - ``LeavePOut(n, p=1)``. - - Due to the high number of test sets (which is the same as the - number of samples) this cross validation method can be very costly. - For large datasets one should favor KFold, StratifiedKFold or - ShuffleSplit. - - Parameters - ---------- - n : int - Total number of elements in dataset. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4]]) - >>> y = np.array([1, 2]) - >>> loo = cross_validation.LeaveOneOut(2) - >>> len(loo) - 2 - >>> print(loo) - sklearn.cross_validation.LeaveOneOut(n=2) - >>> for train_index, test_index in loo: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) - TRAIN: [1] TEST: [0] - [[3 4]] [[1 2]] [2] [1] - TRAIN: [0] TEST: [1] - [[1 2]] [[3 4]] [1] [2] - - See also - -------- - LeaveOneLabelOut for splitting the data according to explicit, - domain-specific stratification of the dataset. - """ - - def _iter_test_indices(self): - return range(self.n) - - def __repr__(self): - return '%s.%s(n=%i)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.n, - ) - - def __len__(self): - return self.n - - -class LeavePOut(_PartitionIterator): - """Leave-P-Out cross validation iterator - - Provides train/test indices to split data in train test sets. This results - in testing on all distinct samples of size p, while the remaining n - p - samples form the training set in each iteration. - - Note: ``LeavePOut(n, p)`` is NOT equivalent to ``KFold(n, n_folds=n // p)`` - which creates non-overlapping test sets. - - Due to the high number of iterations which grows combinatorically with the - number of samples this cross validation method can be very costly. For - large datasets one should favor KFold, StratifiedKFold or ShuffleSplit. - - Parameters - ---------- - n : int - Total number of elements in dataset. - - p : int - Size of the test sets. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 3, 4]) - >>> lpo = cross_validation.LeavePOut(4, 2) - >>> len(lpo) - 6 - >>> print(lpo) - sklearn.cross_validation.LeavePOut(n=4, p=2) - >>> for train_index, test_index in lpo: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... 
y_train, y_test = y[train_index], y[test_index] - TRAIN: [2 3] TEST: [0 1] - TRAIN: [1 3] TEST: [0 2] - TRAIN: [1 2] TEST: [0 3] - TRAIN: [0 3] TEST: [1 2] - TRAIN: [0 2] TEST: [1 3] - TRAIN: [0 1] TEST: [2 3] - """ - - def __init__(self, n, p, indices=None): - super(LeavePOut, self).__init__(n, indices) - self.p = p - - def _iter_test_indices(self): - for comb in combinations(range(self.n), self.p): - yield np.array(comb) - - def __repr__(self): - return '%s.%s(n=%i, p=%i)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.n, - self.p, - ) - - def __len__(self): - return int(factorial(self.n) / factorial(self.n - self.p) - / factorial(self.p)) - - -class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): - """Base class to validate KFold approaches""" - - @abstractmethod - def __init__(self, n, n_folds, indices, shuffle, random_state): - super(_BaseKFold, self).__init__(n, indices) - - if abs(n_folds - int(n_folds)) >= np.finfo('f').eps: - raise ValueError("n_folds must be an integer") - self.n_folds = n_folds = int(n_folds) - - if n_folds <= 1: - raise ValueError( - "k-fold cross validation requires at least one" - " train / test split by setting n_folds=2 or more," - " got n_folds={0}.".format(n_folds)) - if n_folds > self.n: - raise ValueError( - ("Cannot have number of folds n_folds={0} greater" - " than the number of samples: {1}.").format(n_folds, n)) - - if not isinstance(shuffle, bool): - raise TypeError("shuffle must be True or False;" - " got {0}".format(shuffle)) - self.shuffle = shuffle - self.random_state = random_state - - -class KFold(_BaseKFold): - """K-Folds cross validation iterator. - - Provides train/test indices to split data in train test sets. Split - dataset into k consecutive folds (without shuffling). - - Each fold is then used a validation set once while the k - 1 remaining - fold form the training set. - - Parameters - ---------- - n : int - Total number of elements. - - n_folds : int, default=3 - Number of folds. Must be at least 2. - - shuffle : boolean, optional - Whether to shuffle the data before splitting into batches. - - random_state : None, int or RandomState - Pseudo-random number generator state used for random - sampling. If None, use default numpy RNG for shuffling - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([1, 2, 3, 4]) - >>> kf = cross_validation.KFold(4, n_folds=2) - >>> len(kf) - 2 - >>> print(kf) # doctest: +NORMALIZE_WHITESPACE - sklearn.cross_validation.KFold(n=4, n_folds=2, shuffle=False, - random_state=None) - >>> for train_index, test_index in kf: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - TRAIN: [2 3] TEST: [0 1] - TRAIN: [0 1] TEST: [2 3] - - Notes - ----- - The first n % n_folds folds have size n // n_folds + 1, other folds have - size n // n_folds. - - See also - -------- - StratifiedKFold: take label information into account to avoid building - folds with imbalanced class distributions (for binary or multiclass - classification tasks). 
- """ - - def __init__(self, n, n_folds=3, indices=None, shuffle=False, - random_state=None): - super(KFold, self).__init__(n, n_folds, indices, shuffle, random_state) - self.idxs = np.arange(n) - if shuffle: - rng = check_random_state(self.random_state) - rng.shuffle(self.idxs) - - def _iter_test_indices(self): - n = self.n - n_folds = self.n_folds - fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=np.int) - fold_sizes[:n % n_folds] += 1 - current = 0 - for fold_size in fold_sizes: - start, stop = current, current + fold_size - yield self.idxs[start:stop] - current = stop - - def __repr__(self): - return '%s.%s(n=%i, n_folds=%i, shuffle=%s, random_state=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.n, - self.n_folds, - self.shuffle, - self.random_state, - ) - - def __len__(self): - return self.n_folds - - -class StratifiedKFold(_BaseKFold): - """Stratified K-Folds cross validation iterator - - Provides train/test indices to split data in train test sets. - - This cross-validation object is a variation of KFold that - returns stratified folds. The folds are made by preserving - the percentage of samples for each class. - - Parameters - ---------- - y : array-like, [n_samples] - Samples to split in K folds. - - n_folds : int, default=3 - Number of folds. Must be at least 2. - - shuffle : boolean, optional - Whether to shuffle each stratification of the data before splitting - into batches. - - random_state : None, int or RandomState - Pseudo-random number generator state used for random - sampling. If None, use default numpy RNG for shuffling - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([0, 0, 1, 1]) - >>> skf = cross_validation.StratifiedKFold(y, n_folds=2) - >>> len(skf) - 2 - >>> print(skf) # doctest: +NORMALIZE_WHITESPACE - sklearn.cross_validation.StratifiedKFold(labels=[0 0 1 1], n_folds=2, - shuffle=False, random_state=None) - >>> for train_index, test_index in skf: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - TRAIN: [1 3] TEST: [0 2] - TRAIN: [0 2] TEST: [1 3] - - Notes - ----- - All the folds have size trunc(n_samples / n_folds), the last one has the - complementary. - - """ - - def __init__(self, y, n_folds=3, indices=None, shuffle=False, - random_state=None): - super(StratifiedKFold, self).__init__( - len(y), n_folds, indices, shuffle, random_state) - y = np.asarray(y) - n_samples = y.shape[0] - unique_labels, y_inversed = np.unique(y, return_inverse=True) - label_counts = np.bincount(y_inversed) - min_labels = np.min(label_counts) - if self.n_folds > min_labels: - warnings.warn(("The least populated class in y has only %d" - " members, which is too few. The minimum" - " number of labels for any class cannot" - " be less than n_folds=%d." 
- % (min_labels, self.n_folds)), Warning) - - # don't want to use the same seed in each label's shuffle - if self.shuffle: - rng = check_random_state(self.random_state) - else: - rng = self.random_state - - # pre-assign each sample to a test fold index using individual KFold - # splitting strategies for each label so as to respect the - # balance of labels - per_label_cvs = [ - KFold(max(c, self.n_folds), self.n_folds, shuffle=self.shuffle, - random_state=rng) for c in label_counts] - test_folds = np.zeros(n_samples, dtype=np.int) - for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)): - for label, (_, test_split) in zip(unique_labels, per_label_splits): - label_test_folds = test_folds[y == label] - # the test split can be too big because we used - # KFold(max(c, self.n_folds), self.n_folds) instead of - # KFold(c, self.n_folds) to make it possible to not crash even - # if the data is not 100% stratifiable for all the labels - # (we use a warning instead of raising an exception) - # If this is the case, let's trim it: - test_split = test_split[test_split < len(label_test_folds)] - label_test_folds[test_split] = test_fold_idx - test_folds[y == label] = label_test_folds - - self.test_folds = test_folds - self.y = y - - def _iter_test_masks(self): - for i in range(self.n_folds): - yield self.test_folds == i - - def __repr__(self): - return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.y, - self.n_folds, - self.shuffle, - self.random_state, - ) - - def __len__(self): - return self.n_folds - - -class LeaveOneLabelOut(_PartitionIterator): - """Leave-One-Label_Out cross-validation iterator - - Provides train/test indices to split data according to a third-party - provided label. This label information can be used to encode arbitrary - domain specific stratifications of the samples as integers. - - For instance the labels could be the year of collection of the samples - and thus allow for cross-validation against time-based splits. - - Parameters - ---------- - labels : array-like of int with shape (n_samples,) - Arbitrary domain-specific stratification of the data to be used - to draw the splits. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 1, 2]) - >>> labels = np.array([1, 1, 2, 2]) - >>> lol = cross_validation.LeaveOneLabelOut(labels) - >>> len(lol) - 2 - >>> print(lol) - sklearn.cross_validation.LeaveOneLabelOut(labels=[1 1 2 2]) - >>> for train_index, test_index in lol: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... 
print(X_train, X_test, y_train, y_test) - TRAIN: [2 3] TEST: [0 1] - [[5 6] - [7 8]] [[1 2] - [3 4]] [1 2] [1 2] - TRAIN: [0 1] TEST: [2 3] - [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [1 2] - - """ - - def __init__(self, labels, indices=None): - super(LeaveOneLabelOut, self).__init__(len(labels), indices) - # We make a copy of labels to avoid side-effects during iteration - self.labels = np.array(labels, copy=True) - self.unique_labels = np.unique(labels) - self.n_unique_labels = len(self.unique_labels) - - def _iter_test_masks(self): - for i in self.unique_labels: - yield self.labels == i - - def __repr__(self): - return '%s.%s(labels=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.labels, - ) - - def __len__(self): - return self.n_unique_labels - - -class LeavePLabelOut(_PartitionIterator): - """Leave-P-Label_Out cross-validation iterator - - Provides train/test indices to split data according to a third-party - provided label. This label information can be used to encode arbitrary - domain specific stratifications of the samples as integers. - - For instance the labels could be the year of collection of the samples - and thus allow for cross-validation against time-based splits. - - The difference between LeavePLabelOut and LeaveOneLabelOut is that - the former builds the test sets with all the samples assigned to - ``p`` different values of the labels while the latter uses samples - all assigned the same labels. - - Parameters - ---------- - labels : array-like of int with shape (n_samples,) - Arbitrary domain-specific stratification of the data to be used - to draw the splits. - - p : int - Number of samples to leave out in the test split. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6]]) - >>> y = np.array([1, 2, 1]) - >>> labels = np.array([1, 2, 3]) - >>> lpl = cross_validation.LeavePLabelOut(labels, p=2) - >>> len(lpl) - 3 - >>> print(lpl) - sklearn.cross_validation.LeavePLabelOut(labels=[1 2 3], p=2) - >>> for train_index, test_index in lpl: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... 
print(X_train, X_test, y_train, y_test) - TRAIN: [2] TEST: [0 1] - [[5 6]] [[1 2] - [3 4]] [1] [1 2] - TRAIN: [1] TEST: [0 2] - [[3 4]] [[1 2] - [5 6]] [2] [1 1] - TRAIN: [0] TEST: [1 2] - [[1 2]] [[3 4] - [5 6]] [1] [2 1] - """ - - def __init__(self, labels, p, indices=None): - # We make a copy of labels to avoid side-effects during iteration - super(LeavePLabelOut, self).__init__(len(labels), indices) - self.labels = np.array(labels, copy=True) - self.unique_labels = np.unique(labels) - self.n_unique_labels = len(self.unique_labels) - self.p = p - - def _iter_test_masks(self): - comb = combinations(range(self.n_unique_labels), self.p) - for idx in comb: - test_index = self._empty_mask() - idx = np.array(idx) - for l in self.unique_labels[idx]: - test_index[self.labels == l] = True - yield test_index - - def __repr__(self): - return '%s.%s(labels=%s, p=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.labels, - self.p, - ) - - def __len__(self): - return int(factorial(self.n_unique_labels) / - factorial(self.n_unique_labels - self.p) / - factorial(self.p)) - +#TODO: move Boostrap somewhere else and import it from here +#TODO: issue a DeprecationWarning when this module is imported class Bootstrap(object): """Random sampling with replacement cross-validation iterator @@ -745,817 +180,6 @@ def __len__(self): return self.n_iter -class BaseShuffleSplit(with_metaclass(ABCMeta)): - """Base class for ShuffleSplit and StratifiedShuffleSplit""" - - def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, - indices=None, random_state=None, n_iterations=None): - if indices is None: - indices = True - else: - warnings.warn("The indices parameter is deprecated and will be " - "removed (assumed True) in 0.17", DeprecationWarning) - self.n = n - self.n_iter = n_iter - if n_iterations is not None: # pragma: no cover - warnings.warn("n_iterations was renamed to n_iter for consistency " - " and will be removed in 0.16.") - self.n_iter = n_iterations - self.test_size = test_size - self.train_size = train_size - self.random_state = random_state - self._indices = indices - self.n_train, self.n_test = _validate_shuffle_split(n, - test_size, - train_size) - - @property - def indices(self): - warnings.warn("The indices attribute is deprecated and will be " - "removed (assumed True) in 0.17", DeprecationWarning, - stacklevel=1) - return self._indices - - def __iter__(self): - if self._indices: - for train, test in self._iter_indices(): - yield train, test - return - for train, test in self._iter_indices(): - train_m = np.zeros(self.n, dtype=bool) - test_m = np.zeros(self.n, dtype=bool) - train_m[train] = True - test_m[test] = True - yield train_m, test_m - - @abstractmethod - def _iter_indices(self): - """Generate (train, test) indices""" - - -class ShuffleSplit(BaseShuffleSplit): - """Random permutation cross-validation iterator. - - Yields indices to split data into training and test sets. - - Note: contrary to other cross-validation strategies, random splits - do not guarantee that all folds will be different, although this is - still very likely for sizeable datasets. - - Parameters - ---------- - n : int - Total number of elements in the dataset. - - n_iter : int (default 10) - Number of re-shuffling & splitting iterations. - - test_size : float (default 0.1), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. 
If None, - the value is automatically set to the complement of the train size. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int or RandomState - Pseudo-random number generator state used for random sampling. - - Examples - -------- - >>> from sklearn import cross_validation - >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, - ... test_size=.25, random_state=0) - >>> len(rs) - 3 - >>> print(rs) - ... # doctest: +ELLIPSIS - ShuffleSplit(4, n_iter=3, test_size=0.25, ...) - >>> for train_index, test_index in rs: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... - TRAIN: [3 1 0] TEST: [2] - TRAIN: [2 1 3] TEST: [0] - TRAIN: [0 2 1] TEST: [3] - - >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, - ... train_size=0.5, test_size=.25, random_state=0) - >>> for train_index, test_index in rs: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... - TRAIN: [3 1] TEST: [2] - TRAIN: [2 1] TEST: [0] - TRAIN: [0 2] TEST: [3] - - See also - -------- - Bootstrap: cross-validation using re-sampling with replacement. - """ - - def _iter_indices(self): - rng = check_random_state(self.random_state) - for i in range(self.n_iter): - # random partition - permutation = rng.permutation(self.n) - ind_test = permutation[:self.n_test] - ind_train = permutation[self.n_test:self.n_test + self.n_train] - yield ind_train, ind_test - - def __repr__(self): - return ('%s(%d, n_iter=%d, test_size=%s, ' - 'random_state=%s)' % ( - self.__class__.__name__, - self.n, - self.n_iter, - str(self.test_size), - self.random_state, - )) - - def __len__(self): - return self.n_iter - - -def _validate_shuffle_split(n, test_size, train_size): - if test_size is None and train_size is None: - raise ValueError( - 'test_size and train_size can not both be None') - - if test_size is not None: - if np.asarray(test_size).dtype.kind == 'f': - if test_size >= 1.: - raise ValueError( - 'test_size=%f should be smaller ' - 'than 1.0 or be an integer' % test_size) - elif np.asarray(test_size).dtype.kind == 'i': - if test_size >= n: - raise ValueError( - 'test_size=%d should be smaller ' - 'than the number of samples %d' % (test_size, n)) - else: - raise ValueError("Invalid value for test_size: %r" % test_size) - - if train_size is not None: - if np.asarray(train_size).dtype.kind == 'f': - if train_size >= 1.: - raise ValueError("train_size=%f should be smaller " - "than 1.0 or be an integer" % train_size) - elif np.asarray(test_size).dtype.kind == 'f' and \ - train_size + test_size > 1.: - raise ValueError('The sum of test_size and train_size = %f, ' - 'should be smaller than 1.0. Reduce ' - 'test_size and/or train_size.' 
% - (train_size + test_size)) - elif np.asarray(train_size).dtype.kind == 'i': - if train_size >= n: - raise ValueError("train_size=%d should be smaller " - "than the number of samples %d" % - (train_size, n)) - else: - raise ValueError("Invalid value for train_size: %r" % train_size) - - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) - - if train_size is None: - n_train = n - n_test - else: - if np.asarray(train_size).dtype.kind == 'f': - n_train = floor(train_size * n) - else: - n_train = float(train_size) - - if test_size is None: - n_test = n - n_train - - if n_train + n_test > n: - raise ValueError('The sum of train_size and test_size = %d, ' - 'should be smaller than the number of ' - 'samples %d. Reduce test_size and/or ' - 'train_size.' % (n_train + n_test, n)) - - return int(n_train), int(n_test) - - -class StratifiedShuffleSplit(BaseShuffleSplit): - """Stratified ShuffleSplit cross validation iterator - - Provides train/test indices to split data in train test sets. - - This cross-validation object is a merge of StratifiedKFold and - ShuffleSplit, which returns stratified randomized folds. The folds - are made by preserving the percentage of samples for each class. - - Note: like the ShuffleSplit strategy, stratified random splits - do not guarantee that all folds will be different, although this is - still very likely for sizeable datasets. - - Parameters - ---------- - y : array, [n_samples] - Labels of samples. - - n_iter : int (default 10) - Number of re-shuffling & splitting iterations. - - test_size : float (default 0.1), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int or RandomState - Pseudo-random number generator state used for random sampling. - - Examples - -------- - >>> from sklearn.cross_validation import StratifiedShuffleSplit - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([0, 0, 1, 1]) - >>> sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0) - >>> len(sss) - 3 - >>> print(sss) # doctest: +ELLIPSIS - StratifiedShuffleSplit(labels=[0 0 1 1], n_iter=3, ...) - >>> for train_index, test_index in sss: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - TRAIN: [1 2] TEST: [3 0] - TRAIN: [0 2] TEST: [1 3] - TRAIN: [0 2] TEST: [3 1] - """ - - def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, - indices=None, random_state=None, n_iterations=None): - - super(StratifiedShuffleSplit, self).__init__( - len(y), n_iter, test_size, train_size, indices, random_state, - n_iterations) - self.y = np.array(y) - self.classes, self.y_indices = np.unique(y, return_inverse=True) - n_cls = self.classes.shape[0] - - if np.min(np.bincount(self.y_indices)) < 2: - raise ValueError("The least populated class in y has only 1" - " member, which is too few. 
The minimum" - " number of labels for any class cannot" - " be less than 2.") - - if self.n_train < n_cls: - raise ValueError('The train_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (self.n_train, n_cls)) - if self.n_test < n_cls: - raise ValueError('The test_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (self.n_test, n_cls)) - - def _iter_indices(self): - rng = check_random_state(self.random_state) - cls_count = np.bincount(self.y_indices) - p_i = cls_count / float(self.n) - n_i = np.round(self.n_train * p_i).astype(int) - t_i = np.minimum(cls_count - n_i, - np.round(self.n_test * p_i).astype(int)) - - for n in range(self.n_iter): - train = [] - test = [] - - for i, cls in enumerate(self.classes): - permutation = rng.permutation(cls_count[i]) - cls_i = np.where((self.y == cls))[0][permutation] - - train.extend(cls_i[:n_i[i]]) - test.extend(cls_i[n_i[i]:n_i[i] + t_i[i]]) - - # Because of rounding issues (as n_train and n_test are not - # dividers of the number of elements per class), we may end - # up here with less samples in train and test than asked for. - if len(train) < self.n_train or len(test) < self.n_test: - # We complete by affecting randomly the missing indexes - missing_idx = np.where(np.bincount(train + test, - minlength=len(self.y)) == 0, - )[0] - missing_idx = rng.permutation(missing_idx) - train.extend(missing_idx[:(self.n_train - len(train))]) - test.extend(missing_idx[-(self.n_test - len(test)):]) - - train = rng.permutation(train) - test = rng.permutation(test) - - yield train, test - - def __repr__(self): - return ('%s(labels=%s, n_iter=%d, test_size=%s, ' - 'random_state=%s)' % ( - self.__class__.__name__, - self.y, - self.n_iter, - str(self.test_size), - self.random_state, - )) - - def __len__(self): - return self.n_iter - - ############################################################################## -def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, - verbose=0, fit_params=None, score_func=None, - pre_dispatch='2*n_jobs'): - """Evaluate a score by cross-validation - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like - The data to fit. Can be, for example a list, or an array at least 2d. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - cv : cross-validation generator, optional, default: None - A cross-validation generator. If None, a 3-fold cross - validation is used or 3-fold stratified cross-validation - when y is supplied and estimator is a classifier. - - n_jobs : integer, optional - The number of CPUs to use to do the computation. -1 means - 'all CPUs'. - - verbose : integer, optional - The verbosity level. - - fit_params : dict, optional - Parameters to pass to the fit method of the estimator. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. 
Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - Returns - ------- - scores : array of float, shape=(len(list(cv)),) - Array of scores of the estimator for each run of the cross validation. - """ - X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True, - allow_nans=True, allow_nd=True) - if y is not None: - y = np.asarray(y) - - cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, score_func=score_func, scoring=scoring) - # We clone the estimator to make sure that all the folds are - # independent, and that it is pickle-able. - parallel = Parallel(n_jobs=n_jobs, verbose=verbose, - pre_dispatch=pre_dispatch) - scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, - train, test, verbose, None, - fit_params) - for train, test in cv) - return np.array(scores)[:, 0] - - -def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, - fit_params, return_train_score=False, - return_parameters=False): - """Fit estimator and compute scores for a given dataset split. - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like of shape at least 2D - The data to fit. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - scoring : callable - A scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - train : array-like, shape = (n_train_samples,) - Indices of training samples. - - test : array-like, shape = (n_test_samples,) - Indices of test samples. - - verbose : integer - The verbosity level. - - parameters : dict or None - Parameters to be set on the estimator. - - fit_params : dict or None - Parameters that will be passed to ``estimator.fit``. - - return_train_score : boolean, optional, default: False - Compute and return score on training set. - - return_parameters : boolean, optional, default: False - Return parameters that has been used for the estimator. - - Returns - ------- - train_score : float, optional - Score on training set, returned only if `return_train_score` is `True`. - - test_score : float - Score on test set. - - n_test_samples : int - Number of test samples. - - scoring_time : float - Time spent for fitting and scoring in seconds. - - parameters : dict or None, optional - The parameters that have been evaluated. 
- """ - if verbose > 1: - if parameters is None: - msg = "no parameters to be set" - else: - msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) - print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) - - # Adjust lenght of sample weights - n_samples = _num_samples(X) - fit_params = fit_params if fit_params is not None else {} - fit_params = dict([(k, np.asarray(v)[train] - if hasattr(v, '__len__') and len(v) == n_samples else v) - for k, v in fit_params.items()]) - - if parameters is not None: - estimator.set_params(**parameters) - - start_time = time.time() - - X_train, y_train = _safe_split(estimator, X, y, train) - X_test, y_test = _safe_split(estimator, X, y, test, train) - if y_train is None: - estimator.fit(X_train, **fit_params) - else: - estimator.fit(X_train, y_train, **fit_params) - test_score = _score(estimator, X_test, y_test, scorer) - if return_train_score: - train_score = _score(estimator, X_train, y_train, scorer) - - scoring_time = time.time() - start_time - - if verbose > 2: - msg += ", score=%f" % test_score - if verbose > 1: - end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) - print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - - ret = [train_score] if return_train_score else [] - ret.extend([test_score, _num_samples(X_test), scoring_time]) - if return_parameters: - ret.append(parameters) - return ret - - -def _safe_split(estimator, X, y, indices, train_indices=None): - """Create subset of dataset and properly handle kernels.""" - if hasattr(estimator, 'kernel') and callable(estimator.kernel): - # cannot compute the kernel values with custom function - raise ValueError("Cannot use a custom kernel function. " - "Precompute the kernel matrix instead.") - - if not hasattr(X, "shape"): - if getattr(estimator, "_pairwise", False): - raise ValueError("Precomputed kernels or affinity matrices have " - "to be passed as arrays or sparse matrices.") - X_subset = [X[idx] for idx in indices] - else: - if getattr(estimator, "_pairwise", False): - # X is a precomputed square kernel matrix - if X.shape[0] != X.shape[1]: - raise ValueError("X should be a square kernel matrix") - if train_indices is None: - X_subset = X[np.ix_(indices, indices)] - else: - X_subset = X[np.ix_(indices, train_indices)] - else: - X_subset = X[safe_mask(X, indices)] - - if y is not None: - y_subset = y[safe_mask(y, indices)] - else: - y_subset = None - - return X_subset, y_subset - - -def _score(estimator, X_test, y_test, scorer): - """Compute the score of an estimator on a given test set.""" - if y_test is None: - score = scorer(estimator, X_test) - else: - score = scorer(estimator, X_test, y_test) - if not isinstance(score, numbers.Number): - raise ValueError("scoring must return a number, got %s (%s) instead." 
- % (str(score), type(score))) - return score - - -def _permutation_test_score(estimator, X, y, cv, scorer): - """Auxiliary function for permutation_test_score""" - avg_score = [] - for train, test in cv: - estimator.fit(X[train], y[train]) - avg_score.append(scorer(estimator, X[test], y[test])) - return np.mean(avg_score) - - -def _shuffle(y, labels, random_state): - """Return a shuffled copy of y eventually shuffle among same labels.""" - if labels is None: - ind = random_state.permutation(len(y)) - else: - ind = np.arange(len(labels)) - for label in np.unique(labels): - this_mask = (labels == label) - ind[this_mask] = random_state.permutation(ind[this_mask]) - return y[ind] - - -def check_cv(cv, X=None, y=None, classifier=False): - """Input checker utility for building a CV in a user friendly way. - - Parameters - ---------- - cv : int, a cv generator instance, or None - The input specifying which cv generator to use. It can be an - integer, in which case it is the number of folds in a KFold, - None, in which case 3 fold is used, or another object, that - will then be used as a cv generator. - - X : array-like - The data the cross-val object will be applied on. - - y : array-like - The target variable for a supervised learning problem. - - classifier : boolean optional - Whether the task is a classification task, in which case - stratified KFold will be used. - - Returns - ------- - checked_cv: a cross-validation generator instance. - The return value is guaranteed to be a cv generator instance, whatever - the input type. - """ - return _check_cv(cv, X=X, y=y, classifier=classifier, warn_mask=True) - - -def _check_cv(cv, X=None, y=None, classifier=False, warn_mask=False): - # This exists for internal use while indices is being deprecated. - is_sparse = sp.issparse(X) - needs_indices = is_sparse or not hasattr(X, "shape") - if cv is None: - cv = 3 - if isinstance(cv, numbers.Integral): - if warn_mask and not needs_indices: - warnings.warn('check_cv will return indices instead of boolean ' - 'masks from 0.17', DeprecationWarning) - else: - needs_indices = None - if classifier: - cv = StratifiedKFold(y, cv, indices=needs_indices) - else: - if not is_sparse: - n_samples = len(X) - else: - n_samples = X.shape[0] - cv = KFold(n_samples, cv, indices=needs_indices) - if needs_indices and not getattr(cv, "_indices", True): - raise ValueError("Sparse data and lists require indices-based cross" - " validation generator, got: %r", cv) - return cv - - -def permutation_test_score(estimator, X, y, score_func=None, cv=None, - n_permutations=100, n_jobs=1, labels=None, - random_state=0, verbose=0, scoring=None): - """Evaluate the significance of a cross-validated score with permutations - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like of shape at least 2D - The data to fit. - - y : array-like - The target variable to try to predict in the case of - supervised learning. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - cv : integer or cross-validation generator, optional - If an integer is passed, it is the number of fold (default 3). - Specific cross-validation objects can be passed, see - sklearn.cross_validation module for the list of possible objects. - - n_permutations : integer, optional - Number of times to permute ``y``. 
- - n_jobs : integer, optional - The number of CPUs to use to do the computation. -1 means - 'all CPUs'. - - labels : array-like of shape [n_samples] (optional) - Labels constrain the permutation among groups of samples with - a same label. - - random_state : RandomState or an int seed (0 by default) - A random number generator instance to define the state of the - random permutations generator. - - verbose : integer, optional - The verbosity level. - - Returns - ------- - score : float - The true score without permuting targets. - - permutation_scores : array, shape = [n_permutations] - The scores obtained for each permutations. - - pvalue : float - The returned value equals p-value if `score_func` returns bigger - numbers for better scores (e.g., accuracy_score). If `score_func` is - rather a loss function (i.e. when lower is better such as with - `mean_squared_error`) then this is actually the complement of the - p-value: 1 - p-value. - - Notes - ----- - This function implements Test 1 in: - - Ojala and Garriga. Permutation Tests for Studying Classifier - Performance. The Journal of Machine Learning Research (2010) - vol. 11 - - """ - X, y = check_arrays(X, y, sparse_format='csr', allow_nans=True) - cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, scoring=scoring, score_func=score_func) - random_state = check_random_state(random_state) - - # We clone the estimator to make sure that all the folds are - # independent, and that it is pickle-able. - score = _permutation_test_score(clone(estimator), X, y, cv, scorer) - permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(_permutation_test_score)( - clone(estimator), X, _shuffle(y, labels, random_state), cv, - scorer) - for _ in range(n_permutations)) - permutation_scores = np.array(permutation_scores) - pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1) - return score, permutation_scores, pvalue - - -permutation_test_score.__test__ = False # to avoid a pb with nosetests - - -def train_test_split(*arrays, **options): - """Split arrays or matrices into random train and test subsets - - Quick utility that wraps calls to ``check_arrays`` and - ``next(iter(ShuffleSplit(n_samples)))`` and application to input - data into a single call for splitting (and optionally subsampling) - data in a oneliner. - - Parameters - ---------- - *arrays : sequence of arrays or scipy.sparse matrices with same shape[0] - Python lists or tuples occurring in arrays are converted to 1D numpy - arrays. - - test_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - If train size is also None, test size is set to 0.25. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int or RandomState - Pseudo-random number generator state used for random sampling. - - dtype : a numpy dtype instance, None by default - Enforce a specific dtype. - - Returns - ------- - splitting : list of arrays, length=2 * len(arrays) - List containing train-test split of input array. 
- - Examples - -------- - >>> import numpy as np - >>> from sklearn.cross_validation import train_test_split - >>> a, b = np.arange(10).reshape((5, 2)), range(5) - >>> a - array([[0, 1], - [2, 3], - [4, 5], - [6, 7], - [8, 9]]) - >>> list(b) - [0, 1, 2, 3, 4] - - >>> a_train, a_test, b_train, b_test = train_test_split( - ... a, b, test_size=0.33, random_state=42) - ... - >>> a_train - array([[4, 5], - [0, 1], - [6, 7]]) - >>> b_train - array([2, 0, 3]) - >>> a_test - array([[2, 3], - [8, 9]]) - >>> b_test - array([1, 4]) - - """ - n_arrays = len(arrays) - if n_arrays == 0: - raise ValueError("At least one array required as input") - - test_size = options.pop('test_size', None) - train_size = options.pop('train_size', None) - random_state = options.pop('random_state', None) - options['sparse_format'] = 'csr' - options['allow_nans'] = True - - if test_size is None and train_size is None: - test_size = 0.25 - - arrays = check_arrays(*arrays, **options) - n_samples = arrays[0].shape[0] - cv = ShuffleSplit(n_samples, test_size=test_size, - train_size=train_size, - random_state=random_state) - - train, test = next(iter(cv)) - return list(chain.from_iterable((a[train], a[test]) for a in arrays)) - - -train_test_split.__test__ = False # to avoid a pb with nosetests diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index be5cdaec5ecdc..b07a7c32350ef 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -12,8 +12,8 @@ from ..base import MetaEstimatorMixin from ..base import clone from ..base import is_classifier -from ..cross_validation import _check_cv as check_cv -from ..cross_validation import _safe_split, _score +from ..model_selection.partition import _check_cv as check_cv +from ..model_selection.validate import _safe_split, _score from .base import SelectorMixin from ..metrics.scorer import check_scoring diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 753803509b230..bc425750d5c53 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -10,741 +10,12 @@ # Olivier Grisel # License: BSD 3 clause -from abc import ABCMeta, abstractmethod -from collections import Mapping, namedtuple, Sized -from functools import partial, reduce -from itertools import product -import operator - -import numpy as np - -from .base import BaseEstimator, is_classifier, clone -from .base import MetaEstimatorMixin -from .cross_validation import _check_cv as check_cv -from .cross_validation import _fit_and_score -from .externals.joblib import Parallel, delayed -from .externals import six -from .utils import check_random_state -from .utils.validation import _num_samples, check_arrays -from .metrics.scorer import check_scoring +#TODO: add deprecation warning to this module +from .model_selection.search import GridSearchCV, RandomizedSearchCV +from .model_selection.utils import ParameterGrid, ParameterSampler, \ + fit_grid_point __all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point', 'ParameterSampler', 'RandomizedSearchCV'] - -class ParameterGrid(object): - """Grid of parameters with a discrete number of values for each. - - Can be used to iterate over parameter value combinations with the - Python built-in function iter. - - Parameters - ---------- - param_grid : dict of string to sequence, or sequence of such - The parameter grid to explore, as a dictionary mapping estimator - parameters to sequences of allowed values. - - An empty dict signifies default parameters. 
- - A sequence of dicts signifies a sequence of grids to search, and is - useful to avoid exploring parameter combinations that make no sense - or have no effect. See the examples below. - - Examples - -------- - >>> from sklearn.grid_search import ParameterGrid - >>> param_grid = {'a': [1, 2], 'b': [True, False]} - >>> list(ParameterGrid(param_grid)) == ( - ... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, - ... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) - True - - >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] - >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, - ... {'kernel': 'rbf', 'gamma': 1}, - ... {'kernel': 'rbf', 'gamma': 10}] - True - - See also - -------- - :class:`GridSearchCV`: - uses ``ParameterGrid`` to perform a full parallelized parameter search. - """ - - def __init__(self, param_grid): - if isinstance(param_grid, Mapping): - # wrap dictionary in a singleton list to support either dict - # or list of dicts - param_grid = [param_grid] - self.param_grid = param_grid - - def __iter__(self): - """Iterate over the points in the grid. - - Returns - ------- - params : iterator over dict of string to any - Yields dictionaries mapping each estimator parameter to one of its - allowed values. - """ - for p in self.param_grid: - # Always sort the keys of a dictionary, for reproducibility - items = sorted(p.items()) - if not items: - yield {} - else: - keys, values = zip(*items) - for v in product(*values): - params = dict(zip(keys, v)) - yield params - - def __len__(self): - """Number of points on the grid.""" - # Product function that can handle iterables (np.product can't). - product = partial(reduce, operator.mul) - return sum(product(len(v) for v in p.values()) if p else 1 - for p in self.param_grid) - - -class ParameterSampler(object): - """Generator on parameters sampled from given distributions. - - Non-deterministic iterable over random candidate combinations for hyper- - parameter search. - - Note that as of SciPy 0.12, the ``scipy.stats.distributions`` do not accept - a custom RNG instance and always use the singleton RNG from - ``numpy.random``. Hence setting ``random_state`` will not guarantee a - deterministic iteration whenever ``scipy.stats`` distributions are used to - define the parameter search space. - - Parameters - ---------- - param_distributions : dict - Dictionary where the keys are parameters and values - are distributions from which a parameter is to be sampled. - Distributions either have to provide a ``rvs`` function - to sample from them, or can be given as a list of values, - where a uniform distribution is assumed. - - n_iter : integer - Number of parameter settings that are produced. - - random_state : int or RandomState - Pseudo random number generator state used for random uniform sampling - from lists of possible values instead of scipy.stats distributions. - - Returns - ------- - params : dict of string to any - **Yields** dictionaries mapping each estimator parameter to - as sampled value. - - Examples - -------- - >>> from sklearn.grid_search import ParameterSampler - >>> from scipy.stats.distributions import expon - >>> import numpy as np - >>> np.random.seed(0) - >>> param_grid = {'a':[1, 2], 'b': expon()} - >>> param_list = list(ParameterSampler(param_grid, n_iter=4)) - >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items()) - ... for d in param_list] - >>> rounded_list == [{'b': 0.89856, 'a': 1}, - ... {'b': 0.923223, 'a': 1}, - ... {'b': 1.878964, 'a': 2}, - ... 
{'b': 1.038159, 'a': 2}] - True - """ - def __init__(self, param_distributions, n_iter, random_state=None): - self.param_distributions = param_distributions - self.n_iter = n_iter - self.random_state = random_state - - def __iter__(self): - rnd = check_random_state(self.random_state) - # Always sort the keys of a dictionary, for reproducibility - items = sorted(self.param_distributions.items()) - for _ in range(self.n_iter): - params = dict() - for k, v in items: - if hasattr(v, "rvs"): - params[k] = v.rvs() - else: - params[k] = v[rnd.randint(len(v))] - yield params - - def __len__(self): - """Number of points that will be sampled.""" - return self.n_iter - - -def fit_grid_point(X, y, estimator, parameters, train, test, scorer, - verbose, **fit_params): - """Run fit on one set of parameters. - - Parameters - ---------- - X : array-like, sparse matrix or list - Input data. - - y : array-like or None - Targets for input data. - - estimator : estimator object - This estimator will be cloned and then fitted. - - parameters : dict - Parameters to be set on estimator for this grid point. - - train : ndarray, dtype int or bool - Boolean mask or indices for training set. - - test : ndarray, dtype int or bool - Boolean mask or indices for test set. - - scorer : callable or None. - If provided must be a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - verbose : int - Verbosity level. - - **fit_params : kwargs - Additional parameter passed to the fit function of the estimator. - - - Returns - ------- - score : float - Score of this parameter setting on given training / test split. - - parameters : dict - The parameters that have been evaluated. - - n_samples_test : int - Number of test samples in this split. - """ - score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train, - test, verbose, parameters, - fit_params) - return score, parameters, n_samples_test - - -def _check_param_grid(param_grid): - if hasattr(param_grid, 'items'): - param_grid = [param_grid] - - for p in param_grid: - for v in p.values(): - if isinstance(v, np.ndarray) and v.ndim > 1: - raise ValueError("Parameter array should be one-dimensional.") - - check = [isinstance(v, k) for k in (list, tuple, np.ndarray)] - if not True in check: - raise ValueError("Parameter values should be a list.") - - if len(v) == 0: - raise ValueError("Parameter values should be a non-empty " - "list.") - - -class _CVScoreTuple (namedtuple('_CVScoreTuple', - ('parameters', - 'mean_validation_score', - 'cv_validation_scores'))): - # A raw namedtuple is very memory efficient as it packs the attributes - # in a struct to get rid of the __dict__ of attributes in particular it - # does not copy the string for the keys on each instance. - # By deriving a namedtuple class just to introduce the __repr__ method we - # would also reintroduce the __dict__ on the instance. By telling the - # Python interpreter that this subclass uses static __slots__ instead of - # dynamic attributes. Furthermore we don't need any additional slot in the - # subclass so we set __slots__ to the empty tuple. 
- __slots__ = () - - def __repr__(self): - """Simple custom repr to summarize the main info""" - return "mean: {0:.5f}, std: {1:.5f}, params: {2}".format( - self.mean_validation_score, - np.std(self.cv_validation_scores), - self.parameters) - - -class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, - MetaEstimatorMixin)): - """Base class for hyper parameter search with cross-validation.""" - - @abstractmethod - def __init__(self, estimator, scoring=None, loss_func=None, - score_func=None, fit_params=None, n_jobs=1, iid=True, - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): - - self.scoring = scoring - self.estimator = estimator - self.loss_func = loss_func - self.score_func = score_func - self.n_jobs = n_jobs - self.fit_params = fit_params if fit_params is not None else {} - self.iid = iid - self.refit = refit - self.cv = cv - self.verbose = verbose - self.pre_dispatch = pre_dispatch - - def score(self, X, y=None): - """Returns the score on the given test data and labels, if the search - estimator has been refit. The ``score`` function of the best estimator - is used, or the ``scoring`` parameter where unavailable. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Input data, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - Returns - ------- - score : float - - """ - if hasattr(self.best_estimator_, 'score'): - return self.best_estimator_.score(X, y) - if self.scorer_ is None: - raise ValueError("No score function explicitly defined, " - "and the estimator doesn't provide one %s" - % self.best_estimator_) - return self.scorer_(self.best_estimator_, X, y) - - @property - def predict(self): - return self.best_estimator_.predict - - @property - def predict_proba(self): - return self.best_estimator_.predict_proba - - @property - def decision_function(self): - return self.best_estimator_.decision_function - - @property - def transform(self): - return self.best_estimator_.transform - - def _fit(self, X, y, parameter_iterable): - """Actual fitting, performing the search over parameters.""" - - estimator = self.estimator - cv = self.cv - self.scorer_ = check_scoring(self.estimator, scoring=self.scoring, - loss_func=self.loss_func, - score_func=self.score_func) - - n_samples = _num_samples(X) - X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr', - allow_nans=True) - - if y is not None: - if len(y) != n_samples: - raise ValueError('Target variable (y) has a different number ' - 'of samples (%i) than data (X: %i samples)' - % (len(y), n_samples)) - y = np.asarray(y) - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - - if self.verbose > 0: - if isinstance(parameter_iterable, Sized): - n_candidates = len(parameter_iterable) - print("Fitting {0} folds for each of {1} candidates, totalling" - " {2} fits".format(len(cv), n_candidates, - n_candidates * len(cv))) - - base_estimator = clone(self.estimator) - - pre_dispatch = self.pre_dispatch - - out = Parallel( - n_jobs=self.n_jobs, verbose=self.verbose, - pre_dispatch=pre_dispatch - )( - delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, - train, test, self.verbose, parameters, - self.fit_params, return_parameters=True) - for parameters in parameter_iterable - for train, test in cv) - - # Out is a list of triplet: score, estimator, n_test_samples - n_fits = 
len(out) - n_folds = len(cv) - - scores = list() - grid_scores = list() - for grid_start in range(0, n_fits, n_folds): - n_test_samples = 0 - score = 0 - all_scores = [] - for this_score, this_n_test_samples, _, parameters in \ - out[grid_start:grid_start + n_folds]: - all_scores.append(this_score) - if self.iid: - this_score *= this_n_test_samples - n_test_samples += this_n_test_samples - score += this_score - if self.iid: - score /= float(n_test_samples) - else: - score /= float(n_folds) - scores.append((score, parameters)) - # TODO: shall we also store the test_fold_sizes? - grid_scores.append(_CVScoreTuple( - parameters, - score, - np.array(all_scores))) - # Store the computed scores - self.grid_scores_ = grid_scores - - # Find the best parameters by comparing on the mean validation score: - # note that `sorted` is deterministic in the way it breaks ties - best = sorted(grid_scores, key=lambda x: x.mean_validation_score, - reverse=True)[0] - self.best_params_ = best.parameters - self.best_score_ = best.mean_validation_score - - if self.refit: - # fit the best estimator using the entire dataset - # clone first to work around broken estimators - best_estimator = clone(base_estimator).set_params( - **best.parameters) - if y is not None: - best_estimator.fit(X, y, **self.fit_params) - else: - best_estimator.fit(X, **self.fit_params) - self.best_estimator_ = best_estimator - return self - - -class GridSearchCV(BaseSearchCV): - """Exhaustive search over specified parameter values for an estimator. - - Important members are fit, predict. - - GridSearchCV implements a "fit" method and a "predict" method like - any classifier except that the parameters of the classifier - used to predict is optimized by cross-validation. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - A object of that type is instantiated for each grid point. - - param_grid : dict or list of dictionaries - Dictionary with parameters names (string) as keys and lists of - parameter settings to try as values, or a list of such - dictionaries, in which case the grids spanned by each dictionary - in the list are explored. This enables searching over any sequence - of parameter settings. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - fit_params : dict, optional - Parameters to pass to the fit method. - - n_jobs : int, optional - Number of jobs to run in parallel (default 1). - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - iid : boolean, optional - If True, the data is assumed to be identically distributed across - the folds, and the loss minimized is the total loss per sample, - and not the mean loss across the folds. - - cv : integer or cross-validation generator, optional - If an integer is passed, it is the number of folds (default 3). 
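A worked illustration of the aggregation loop in ``_fit`` above, with hypothetical per-fold numbers: when ``iid=True`` each fold score is weighted by its number of test samples, otherwise the plain mean over folds is used.

import numpy as np

fold_scores = np.array([0.80, 0.90, 0.70])  # hypothetical per-fold validation scores
fold_sizes = np.array([40, 40, 20])         # hypothetical numbers of test samples per fold

score_iid = np.sum(fold_scores * fold_sizes) / fold_sizes.sum()  # iid=True  -> 0.82
score_avg = fold_scores.mean()                                   # iid=False -> 0.80
print(score_iid, score_avg)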
- Specific cross-validation objects can be passed, see - sklearn.cross_validation module for the list of possible objects - - refit : boolean - Refit the best estimator with the entire dataset. - If "False", it is impossible to make predictions using - this GridSearchCV instance after fitting. - - verbose : integer - Controls the verbosity: the higher, the more messages. - - Examples - -------- - >>> from sklearn import svm, grid_search, datasets - >>> iris = datasets.load_iris() - >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} - >>> svr = svm.SVC() - >>> clf = grid_search.GridSearchCV(svr, parameters) - >>> clf.fit(iris.data, iris.target) - ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - GridSearchCV(cv=None, - estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=..., - degree=..., gamma=..., kernel='rbf', max_iter=-1, - probability=False, random_state=None, shrinking=True, - tol=..., verbose=False), - fit_params={}, iid=..., loss_func=..., n_jobs=1, - param_grid=..., pre_dispatch=..., refit=..., score_func=..., - scoring=..., verbose=...) - - - Attributes - ---------- - `grid_scores_` : list of named tuples - Contains scores for all parameter combinations in param_grid. - Each entry corresponds to one parameter setting. - Each named tuple has the attributes: - - * ``parameters``, a dict of parameter settings - * ``mean_validation_score``, the mean score over the - cross-validation folds - * ``cv_validation_scores``, the list of scores for each fold - - `best_estimator_` : estimator - Estimator that was chosen by the search, i.e. estimator - which gave highest score (or smallest loss if specified) - on the left out data. - - `best_score_` : float - Score of best_estimator on the left out data. - - `best_params_` : dict - Parameter setting that gave the best results on the hold out data. - - `scorer_` : function - Scorer function used on the held out data to choose the best - parameters for the model. - - Notes - ------ - The parameters selected are those that maximize the score of the left out - data, unless an explicit score is passed in which case it is used instead. - - If `n_jobs` was set to a value higher than one, the data is copied for each - point in the grid (and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - - See Also - --------- - :class:`ParameterGrid`: - generates all the combinations of a an hyperparameter grid. - - :func:`sklearn.cross_validation.train_test_split`: - utility function to split the data into a development set usable - for fitting a GridSearchCV instance and an evaluation set for - its final evaluation. - - :func:`sklearn.metrics.make_scorer`: - Make a scorer from a performance metric or loss function. - - """ - - def __init__(self, estimator, param_grid, scoring=None, loss_func=None, - score_func=None, fit_params=None, n_jobs=1, iid=True, - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): - super(GridSearchCV, self).__init__( - estimator, scoring, loss_func, score_func, fit_params, n_jobs, iid, - refit, cv, verbose, pre_dispatch) - self.param_grid = param_grid - _check_param_grid(param_grid) - - def fit(self, X, y=None): - """Run fit with all sets of parameters. 
- - Parameters - ---------- - - X : array-like, shape = [n_samples, n_features] - Training vector, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - """ - return self._fit(X, y, ParameterGrid(self.param_grid)) - - -class RandomizedSearchCV(BaseSearchCV): - """Randomized search on hyper parameters. - - RandomizedSearchCV implements a "fit" method and a "predict" method like - any classifier except that the parameters of the classifier - used to predict is optimized by cross-validation. - - In contrast to GridSearchCV, not all parameter values are tried out, but - rather a fixed number of parameter settings is sampled from the specified - distributions. The number of parameter settings that are tried is - given by n_iter. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - A object of that type is instantiated for each parameter setting. - - param_distributions : dict - Dictionary with parameters names (string) as keys and distributions - or lists of parameters to try. Distributions must provide a ``rvs`` - method for sampling (such as those from scipy.stats.distributions). - If a list is given, it is sampled uniformly. - - n_iter : int, default=10 - Number of parameter settings that are sampled. n_iter trades - off runtime vs quality of the solution. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - fit_params : dict, optional - Parameters to pass to the fit method. - - n_jobs : int, optional - Number of jobs to run in parallel (default 1). - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - iid : boolean, optional - If True, the data is assumed to be identically distributed across - the folds, and the loss minimized is the total loss per sample, - and not the mean loss across the folds. - - cv : integer or cross-validation generator, optional - If an integer is passed, it is the number of folds (default 3). - Specific cross-validation objects can be passed, see - sklearn.cross_validation module for the list of possible objects - - refit : boolean - Refit the best estimator with the entire dataset. - If "False", it is impossible to make predictions using - this RandomizedSearchCV instance after fitting. - - verbose : integer - Controls the verbosity: the higher, the more messages. - - - Attributes - ---------- - `grid_scores_` : list of named tuples - Contains scores for all parameter combinations in param_grid. - Each entry corresponds to one parameter setting. 
- Each named tuple has the attributes: - - * ``parameters``, a dict of parameter settings - * ``mean_validation_score``, the mean score over the - cross-validation folds - * ``cv_validation_scores``, the list of scores for each fold - - `best_estimator_` : estimator - Estimator that was chosen by the search, i.e. estimator - which gave highest score (or smallest loss if specified) - on the left out data. - - `best_score_` : float - Score of best_estimator on the left out data. - - `best_params_` : dict - Parameter setting that gave the best results on the hold out data. - - Notes - ----- - The parameters selected are those that maximize the score of the held-out - data, according to the scoring parameter. - - If `n_jobs` was set to a value higher than one, the data is copied for each - parameter setting(and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - - See Also - -------- - :class:`GridSearchCV`: - Does exhaustive search over a grid of parameters. - - :class:`ParameterSampler`: - A generator over parameter settins, constructed from - param_distributions. - - """ - - def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, - fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, - verbose=0, pre_dispatch='2*n_jobs', random_state=None): - - self.param_distributions = param_distributions - self.n_iter = n_iter - self.random_state = random_state - super(RandomizedSearchCV, self).__init__( - estimator=estimator, scoring=scoring, fit_params=fit_params, - n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, - pre_dispatch=pre_dispatch) - - def fit(self, X, y=None): - """Run fit on the estimator with randomly drawn parameters. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vector, where n_samples in the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - """ - sampled_params = ParameterSampler(self.param_distributions, - self.n_iter, - random_state=self.random_state) - return self._fit(X, y, sampled_params) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 9debbe0776287..1ada1d5d3b257 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -4,303 +4,6 @@ # # License: BSD 3 clause -import warnings +from sklearn.model_selection.validate import learning_curve, validation_curve -import numpy as np - -from .base import is_classifier, clone -from .cross_validation import _check_cv -from .externals.joblib import Parallel, delayed -from .cross_validation import _safe_split, _score, _fit_and_score -from .metrics.scorer import check_scoring -from .utils import check_arrays -from .utils.fixes import astype - - -def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5), - cv=None, scoring=None, exploit_incremental_learning=False, - n_jobs=1, pre_dispatch="all", verbose=0): - """Learning curve. - - Determines cross-validated training and test scores for different training - set sizes. - - A cross-validation generator splits the whole dataset k times in training - and test data. 
Subsets of the training set with varying sizes will be used - to train the estimator and a score for each training subset size and the - test set will be computed. Afterwards, the scores will be averaged over - all k runs for each training subset size. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - An object of that type which is cloned for each validation. - - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape (n_samples) or (n_samples, n_features), optional - Target relative to X for classification or regression; - None for unsupervised learning. - - train_sizes : array-like, shape (n_ticks,), dtype float or int - Relative or absolute numbers of training examples that will be used to - generate the learning curve. If the dtype is float, it is regarded as a - fraction of the maximum size of the training set (that is determined - by the selected validation method), i.e. it has to be within (0, 1]. - Otherwise it is interpreted as absolute sizes of the training sets. - Note that for classification the number of samples usually have to - be big enough to contain at least one sample from each class. - (default: np.linspace(0.1, 1.0, 5)) - - cv : integer, cross-validation generator, optional - If an integer is passed, it is the number of folds (defaults to 3). - Specific cross-validation objects can be passed, see - sklearn.cross_validation module for the list of possible objects - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - exploit_incremental_learning : boolean, optional, default: False - If the estimator supports incremental learning, this will be - used to speed up fitting for different training set sizes. - - n_jobs : integer, optional - Number of jobs to run in parallel (default 1). - - pre_dispatch : integer or string, optional - Number of predispatched jobs for parallel execution (default is - all). The option can reduce the allocated memory. The string can - be an expression like '2*n_jobs'. - - verbose : integer, optional - Controls the verbosity: the higher, the more messages. - - Returns - ------- - train_sizes_abs : array, shape = (n_unique_ticks,), dtype int - Numbers of training examples that has been used to generate the - learning curve. Note that the number of ticks might be less - than n_ticks because duplicate entries will be removed. - - train_scores : array, shape (n_ticks, n_cv_folds) - Scores on training sets. - - test_scores : array, shape (n_ticks, n_cv_folds) - Scores on test set. 
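A short usage sketch of ``learning_curve`` as documented above; the dataset and estimator are placeholders, and the import path is this codebase's ``sklearn.learning_curve`` module (re-exported from ``sklearn.model_selection`` by this patch).

import numpy as np
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from sklearn.learning_curve import learning_curve  # also sklearn.model_selection after this patch

digits = load_digits()
X, y = digits.data, digits.target

train_sizes, train_scores, test_scores = learning_curve(
    GaussianNB(), X, y, train_sizes=np.linspace(0.1, 1.0, 5), cv=5)

print(train_sizes)                 # absolute training set sizes actually used
print(train_scores.mean(axis=1))   # mean training score for each size
print(test_scores.mean(axis=1))    # mean cross-validation score for each size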
- - Notes - ----- - See :ref:`examples/plot_learning_curve.py ` - """ - if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): - raise ValueError("An estimator must support the partial_fit interface " - "to exploit incremental learning") - - X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) - # Make a list since we will be iterating multiple times over the folds - cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) - scorer = check_scoring(estimator, scoring=scoring) - - # HACK as long as boolean indices are allowed in cv generators - if cv[0][0].dtype == bool: - new_cv = [] - for i in range(len(cv)): - new_cv.append((np.nonzero(cv[i][0])[0], np.nonzero(cv[i][1])[0])) - cv = new_cv - - n_max_training_samples = len(cv[0][0]) - # Because the lengths of folds can be significantly different, it is - # not guaranteed that we use all of the available training data when we - # use the first 'n_max_training_samples' samples. - train_sizes_abs = _translate_train_sizes(train_sizes, - n_max_training_samples) - n_unique_ticks = train_sizes_abs.shape[0] - if verbose > 0: - print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) - - parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, - verbose=verbose) - if exploit_incremental_learning: - classes = np.unique(y) if is_classifier(estimator) else None - out = parallel(delayed(_incremental_fit_estimator)( - clone(estimator), X, y, classes, train, test, train_sizes_abs, - scorer, verbose) for train, test in cv) - else: - out = parallel(delayed(_fit_and_score)( - clone(estimator), X, y, scorer, train[:n_train_samples], test, - verbose, parameters=None, fit_params=None, return_train_score=True) - for train, test in cv for n_train_samples in train_sizes_abs) - out = np.array(out)[:, :2] - n_cv_folds = out.shape[0] // n_unique_ticks - out = out.reshape(n_cv_folds, n_unique_ticks, 2) - - out = np.asarray(out).transpose((2, 1, 0)) - - return train_sizes_abs, out[0], out[1] - - -def _translate_train_sizes(train_sizes, n_max_training_samples): - """Determine absolute sizes of training subsets and validate 'train_sizes'. - - Examples: - _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] - _translate_train_sizes([5, 10], 10) -> [5, 10] - - Parameters - ---------- - train_sizes : array-like, shape (n_ticks,), dtype float or int - Numbers of training examples that will be used to generate the - learning curve. If the dtype is float, it is regarded as a - fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. - - n_max_training_samples : int - Maximum number of training samples (upper bound of 'train_sizes'). - - Returns - ------- - train_sizes_abs : array, shape (n_unique_ticks,), dtype int - Numbers of training examples that will be used to generate the - learning curve. Note that the number of ticks might be less - than n_ticks because duplicate entries will be removed. - """ - train_sizes_abs = np.asarray(train_sizes) - n_ticks = train_sizes_abs.shape[0] - n_min_required_samples = np.min(train_sizes_abs) - n_max_required_samples = np.max(train_sizes_abs) - if np.issubdtype(train_sizes_abs.dtype, np.float): - if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: - raise ValueError("train_sizes has been interpreted as fractions " - "of the maximum number of training samples and " - "must be within (0, 1], but is within [%f, %f]." 
- % (n_min_required_samples, - n_max_required_samples)) - train_sizes_abs = astype(train_sizes_abs * n_max_training_samples, - dtype=np.int, copy=False) - train_sizes_abs = np.clip(train_sizes_abs, 1, - n_max_training_samples) - else: - if (n_min_required_samples <= 0 or - n_max_required_samples > n_max_training_samples): - raise ValueError("train_sizes has been interpreted as absolute " - "numbers of training samples and must be within " - "(0, %d], but is within [%d, %d]." - % (n_max_training_samples, - n_min_required_samples, - n_max_required_samples)) - - train_sizes_abs = np.unique(train_sizes_abs) - if n_ticks > train_sizes_abs.shape[0]: - warnings.warn("Removed duplicate entries from 'train_sizes'. Number " - "of ticks will be less than than the size of " - "'train_sizes' %d instead of %d)." - % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning) - - return train_sizes_abs - - -def _incremental_fit_estimator(estimator, X, y, classes, train, test, - train_sizes, scorer, verbose): - """Train estimator on training subsets incrementally and compute scores.""" - train_scores, test_scores = [], [] - partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) - for n_train_samples, partial_train in partitions: - train_subset = train[:n_train_samples] - X_train, y_train = _safe_split(estimator, X, y, train_subset) - X_partial_train, y_partial_train = _safe_split(estimator, X, y, - partial_train) - X_test, y_test = _safe_split(estimator, X, y, test, train_subset) - if y_partial_train is None: - estimator.partial_fit(X_partial_train, classes=classes) - else: - estimator.partial_fit(X_partial_train, y_partial_train, - classes=classes) - train_scores.append(_score(estimator, X_train, y_train, scorer)) - test_scores.append(_score(estimator, X_test, y_test, scorer)) - return np.array((train_scores, test_scores)).T - - -def validation_curve(estimator, X, y, param_name, param_range, cv=None, - scoring=None, n_jobs=1, pre_dispatch="all", verbose=0): - """Validation curve. - - Determine training and test scores for varying parameter values. - - Compute scores for an estimator with different values of a specified - parameter. This is similar to grid search with one parameter. However, this - will also compute training scores and is merely a utility for plotting the - results. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - An object of that type which is cloned for each validation. - - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape (n_samples) or (n_samples, n_features), optional - Target relative to X for classification or regression; - None for unsupervised learning. - - param_name : string - Name of the parameter that will be varied. - - param_range : array-like, shape (n_values,) - The values of the parameter that will be evaluated. - - cv : integer, cross-validation generator, optional - If an integer is passed, it is the number of folds (defaults to 3). - Specific cross-validation objects can be passed, see - sklearn.cross_validation module for the list of possible objects - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - n_jobs : integer, optional - Number of jobs to run in parallel (default 1). 
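A simplified sketch of what ``_translate_train_sizes`` (shown further above) does with the ``train_sizes`` argument; the sample counts are hypothetical, and the private helper itself additionally clips, de-duplicates and warns.

import numpy as np

n_max_training_samples = 80            # e.g. 100 samples under 5-fold CV
fractions = np.linspace(0.1, 1.0, 5)   # floats in (0, 1] are fractions of that maximum

absolute = (fractions * n_max_training_samples).astype(int)
print(absolute)                        # [ 8 26 44 62 80]

print(np.unique([10, 20, 20, 40]))     # integer ticks are kept as-is, duplicates dropped -> [10 20 40]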
- - pre_dispatch : integer or string, optional - Number of predispatched jobs for parallel execution (default is - all). The option can reduce the allocated memory. The string can - be an expression like '2*n_jobs'. - - verbose : integer, optional - Controls the verbosity: the higher, the more messages. - - Returns - ------- - train_scores : array, shape (n_ticks, n_cv_folds) - Scores on training sets. - - test_scores : array, shape (n_ticks, n_cv_folds) - Scores on test set. - - Notes - ----- - See - :ref:`examples/plot_validation_curve.py ` - """ - X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) - cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, scoring=scoring) - - parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, - verbose=verbose) - out = parallel(delayed(_fit_and_score)( - estimator, X, y, scorer, train, test, verbose, - parameters={param_name: v}, fit_params=None, return_train_score=True) - for train, test in cv for v in param_range) - - out = np.asarray(out)[:, :2] - n_params = len(param_range) - n_cv_folds = out.shape[0] // n_params - out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0)) - - return out[0], out[1] +#TODO: issue warning when importing this module diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index 7009f4861b375..ad0e08edc5a9a 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -16,7 +16,7 @@ from ..base import RegressorMixin from .base import center_data, sparse_center_data from ..utils import array2d, atleast2d_or_csc -from ..cross_validation import _check_cv as check_cv +from ..model_selection.partition import _check_cv as check_cv from ..externals.joblib import Parallel, delayed from ..externals import six from ..externals.six.moves import xrange diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 4d52580311db5..8f9909fde5420 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -22,7 +22,7 @@ from .base import LinearModel from ..base import RegressorMixin from ..utils import array2d, arrayfuncs, as_float_array, check_arrays -from ..cross_validation import _check_cv as check_cv +from ..model_selection.partition import _check_cv as check_cv from ..utils import ConvergenceWarning from ..externals.joblib import Parallel, delayed from ..externals.six.moves import xrange diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index 8b0cbf53e0905..043dd0a91a165 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -15,7 +15,7 @@ from .base import LinearModel, _pre_fit from ..base import RegressorMixin from ..utils import array2d, as_float_array, check_arrays -from ..cross_validation import _check_cv as check_cv +from ..model_selection.partition import _check_cv as check_cv from ..externals.joblib import Parallel, delayed import scipy diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py new file mode 100644 index 0000000000000..c790b1667582f --- /dev/null +++ b/sklearn/model_selection/__init__.py @@ -0,0 +1,28 @@ +from .partition import LeaveOneOut, LeavePOut, KFold, \ + StratifiedKFold, LeaveOneLabelOut, LeavePLabelOut, \ + ShuffleSplit, StratifiedShuffleSplit, train_test_split, check_cv +from .validate import cross_val_score, permutation_test_score, \ + learning_curve, validation_curve +from .search import GridSearchCV, 
RandomizedSearchCV +from .utils import ParameterGrid, ParameterSampler, fit_grid_point + +__all__ = ['KFold', + 'LeaveOneLabelOut', + 'LeaveOneOut', + 'LeavePLabelOut', + 'LeavePOut', + 'ShuffleSplit', + 'StratifiedKFold', + 'StratifiedShuffleSplit', + 'check_cv', + 'cross_val_score', + 'permutation_test_score', + 'train_test_split', + 'learning_curve', + 'validation_curve', + 'GridSearchCV', + 'RandomizedSearchCV', + 'ParameterGrid', + 'ParameterSampler', + 'fit_grid_point', + ] diff --git a/sklearn/model_selection/partition.py b/sklearn/model_selection/partition.py new file mode 100644 index 0000000000000..510cc8ac7a7c5 --- /dev/null +++ b/sklearn/model_selection/partition.py @@ -0,0 +1,1108 @@ +""" +The :mod:`sklearn.model_selection.partition` module includes +""" +#TODO Complete docstring + +# Author: Alexandre Gramfort , +# Gael Varoquaux , +# Olivier Grisel +# License: BSD 3 clause + +from __future__ import print_function +from __future__ import division + +import warnings +from itertools import chain, combinations +from math import ceil, floor, factorial +import numbers +from abc import ABCMeta, abstractmethod + +import numpy as np +import scipy.sparse as sp + +from sklearn.utils import check_arrays, check_random_state +from sklearn.externals.six import with_metaclass +from sklearn.externals.six.moves import zip + + +class _PartitionIterator(with_metaclass(ABCMeta)): + """Base class for CV iterators where train_mask = ~test_mask + + Implementations must define `_iter_test_masks` or `_iter_test_indices`. + + Parameters + ---------- + n : int + Total number of elements in dataset. + """ + + def __init__(self, n=None, indices=None): + if indices is None: + indices = True + else: + warnings.warn("The indices parameter is deprecated and will be " + "removed (assumed True) in 0.17", DeprecationWarning, + stacklevel=1) + if n is not None: + warnings.warn("The n parameter is deprecated and will be " + "removed (use split method instead)", + DeprecationWarning, stacklevel=1) + if abs(n - int(n)) >= np.finfo('f').eps: + raise ValueError("n must be an integer") + n = int(n) + self.n = n + self._indices = indices + + @property + def indices(self): + warnings.warn("The indices attribute is deprecated and will be " + "removed (assumed True) in 0.17", DeprecationWarning, + stacklevel=1) + return self._indices + + def __iter__(self): + #TODO: deprecation warning + if self.n is None: + raise ValueError("Cannot iterate dataless CV iterator") + return self.split(None) + + def split(self, y): + self._pre_split_check(y) + indices = self._indices + if indices: + ind = np.arange(self._sample_size(y)) + for test_index in self._iter_test_masks(y): + train_index = np.logical_not(test_index) + if indices: + train_index = ind[train_index] + test_index = ind[test_index] + yield train_index, test_index + + def _pre_split_check(self, y): + pass + + def _sample_size(self, y): + return self.n if y is None else len(y) #TODO: Check for dict of arrays or dataframe + + # Since subclasses must implement either _iter_test_masks or + # _iter_test_indices, neither can be abstract. + def _iter_test_masks(self, y): + """Generates boolean masks corresponding to test sets. 
+ + By default, delegates to _iter_test_indices() + """ + for test_index in self._iter_test_indices(y): + test_mask = self._empty_mask(y) + test_mask[test_index] = True + yield test_mask + + def _iter_test_indices(self, y): + """Generates integer indices corresponding to test sets.""" + raise NotImplementedError + + def _empty_mask(self, y): + return np.zeros(self._sample_size(y), dtype=np.bool) + + +class LeaveOneOut(_PartitionIterator): + """Leave-One-Out cross validation iterator. + + Provides train/test indices to split data in train test sets. Each + sample is used once as a test set (singleton) while the remaining + samples form the training set. + + Note: ``LeaveOneOut(n)`` is equivalent to ``KFold(n, n_folds=n)`` and + ``LeavePOut(n, p=1)``. + + Due to the high number of test sets (which is the same as the + number of samples) this cross validation method can be very costly. + For large datasets one should favor KFold, StratifiedKFold or + ShuffleSplit. + + Parameters + ---------- + n : int + Total number of elements in dataset. + + Examples + -------- + >>> from sklearn import model_selection + >>> X = np.array([[1, 2], [3, 4]]) + >>> y = np.array([1, 2]) + >>> loo = model_selection.LeaveOneOut(2) + >>> len(loo) + 2 + >>> print(loo) + sklearn.model_selection.partition.LeaveOneOut(n=2) + >>> for train_index, test_index in loo: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) + TRAIN: [1] TEST: [0] + [[3 4]] [[1 2]] [2] [1] + TRAIN: [0] TEST: [1] + [[1 2]] [[3 4]] [1] [2] + + See also + -------- + LeaveOneLabelOut for splitting the data according to explicit, + domain-specific stratification of the dataset. + """ + + def _iter_test_indices(self, y): + return range(self._sample_size(y)) + + def __repr__(self): + return '%s.%s(n=%i)' % ( + self.__class__.__module__, + self.__class__.__name__, + self.n, + ) + + def __len__(self): + # TODO: remove? + return self.n + + +class LeavePOut(_PartitionIterator): + """Leave-P-Out cross validation iterator + + Provides train/test indices to split data in train test sets. This results + in testing on all distinct samples of size p, while the remaining n - p + samples form the training set in each iteration. + + Note: ``LeavePOut(n, p)`` is NOT equivalent to ``KFold(n, n_folds=n // p)`` + which creates non-overlapping test sets. + + Due to the high number of iterations which grows combinatorically with the + number of samples this cross validation method can be very costly. For + large datasets one should favor KFold, StratifiedKFold or ShuffleSplit. + + Parameters + ---------- + n : int + Total number of elements in dataset. + + p : int + Size of the test sets. + + Examples + -------- + >>> from sklearn import model_selection + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 3, 4]) + >>> lpo = model_selection.LeavePOut(4, 2) + >>> len(lpo) + 6 + >>> print(lpo) + sklearn.model_selection.partition.LeavePOut(n=4, p=2) + >>> for train_index, test_index in lpo: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... 
y_train, y_test = y[train_index], y[test_index] + TRAIN: [2 3] TEST: [0 1] + TRAIN: [1 3] TEST: [0 2] + TRAIN: [1 2] TEST: [0 3] + TRAIN: [0 3] TEST: [1 2] + TRAIN: [0 2] TEST: [1 3] + TRAIN: [0 1] TEST: [2 3] + """ + + def __init__(self, n=None, p=None, indices=None): + super(LeavePOut, self).__init__(n, indices) + if p is None: + raise ValueError("LeavePOut: must supply p") + self.p = p + + def _iter_test_indices(self, y): + for comb in combinations(range(self._sample_size(y)), self.p): + yield np.array(comb) + + def __repr__(self): + return '%s.%s(n=%i, p=%i)' % ( + self.__class__.__module__, + self.__class__.__name__, + self.n, + self.p, + ) + + def __len__(self): + # TODO: remove? + return int(factorial(self.n) / factorial(self.n - self.p) + / factorial(self.p)) + + +class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): + """Base class to validate KFold approaches""" + + @abstractmethod + def __init__(self, n, n_folds, indices, shuffle, random_state): + super(_BaseKFold, self).__init__(n, indices) + + if abs(n_folds - int(n_folds)) >= np.finfo('f').eps: + raise ValueError("n_folds must be an integer") + self.n_folds = n_folds = int(n_folds) + + if n_folds <= 1: + raise ValueError( + "k-fold cross validation requires at least one" + " train / test split by setting n_folds=2 or more," + " got n_folds={0}.".format(n_folds)) + + if not isinstance(shuffle, bool): + raise TypeError("shuffle must be True or False;" + " got {0}".format(shuffle)) + self.shuffle = shuffle + self.random_state = random_state + + def _pre_split_check(self, y): + n = self._sample_size(y) + if self.n_folds > n: + raise ValueError( + ("Cannot have number of folds n_folds={0} greater" + " than the number of samples: {1}.").format(self.n_folds, n)) + + +class KFold(_BaseKFold): + """K-Folds cross validation iterator. + + Provides train/test indices to split data in train test sets. Split + dataset into k consecutive folds (without shuffling). + + Each fold is then used a validation set once while the k - 1 remaining + fold form the training set. + + Parameters + ---------- + n : int + Total number of elements. + + n_folds : int, default=3 + Number of folds. Must be at least 2. + + shuffle : boolean, optional + Whether to shuffle the data before splitting into batches. + + random_state : None, int or RandomState + Pseudo-random number generator state used for random + sampling. If None, use default numpy RNG for shuffling + + Examples + -------- + >>> from sklearn import model_selection + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([1, 2, 3, 4]) + >>> kf = model_selection.KFold(4, n_folds=2) + >>> len(kf) + 2 + >>> print(kf) # doctest: +NORMALIZE_WHITESPACE + sklearn.model_selection.partition.KFold(n=4, n_folds=2, shuffle=False, + random_state=None) + >>> for train_index, test_index in kf: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [2 3] TEST: [0 1] + TRAIN: [0 1] TEST: [2 3] + + Notes + ----- + The first n % n_folds folds have size n // n_folds + 1, other folds have + size n // n_folds. + + See also + -------- + StratifiedKFold: take label information into account to avoid building + folds with imbalanced class distributions (for binary or multiclass + classification tasks). 
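The fold sizes described in the Notes above can be reproduced in a few lines; ``n`` and ``n_folds`` below are hypothetical.

import numpy as np

n, n_folds = 10, 3
fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=int)
fold_sizes[:n % n_folds] += 1
print(fold_sizes)                      # [4 3 3]: the first n % n_folds folds get one extra sample

current = 0
for fold_size in fold_sizes:           # consecutive, unshuffled test blocks
    print(np.arange(n)[current:current + fold_size])
    current += fold_size
# [0 1 2 3]  [4 5 6]  [7 8 9]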
+ """ + + def __init__(self, n=None, n_folds=3, indices=None, shuffle=False, + random_state=None): + super(KFold, self).__init__(n, n_folds, indices, shuffle, random_state) + + def _iter_test_indices(self, y): + n = self._sample_size(y) + idxs = np.arange(n) + if self.shuffle: + rng = check_random_state(self.random_state) + rng.shuffle(idxs) + n_folds = self.n_folds + fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=np.int) + fold_sizes[:n % n_folds] += 1 + current = 0 + for fold_size in fold_sizes: + start, stop = current, current + fold_size + yield idxs[start:stop] + current = stop + + def __repr__(self): + return '%s.%s(n=%i, n_folds=%i, shuffle=%s, random_state=%s)' % ( + self.__class__.__module__, + self.__class__.__name__, + self.n, + self.n_folds, + self.shuffle, + self.random_state, + ) + + def __len__(self): + return self.n_folds + + +class StratifiedKFold(_BaseKFold): + """Stratified K-Folds cross validation iterator + + Provides train/test indices to split data in train test sets. + + This cross-validation object is a variation of KFold that + returns stratified folds. The folds are made by preserving + the percentage of samples for each class. + + Parameters + ---------- + y : array-like, [n_samples] + Samples to split in K folds. + + n_folds : int, default=3 + Number of folds. Must be at least 2. + + shuffle : boolean, optional + Whether to shuffle each stratification of the data before splitting + into batches. + + random_state : None, int or RandomState + Pseudo-random number generator state used for random + sampling. If None, use default numpy RNG for shuffling + + Examples + -------- + >>> from sklearn import model_selection + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> skf = model_selection.StratifiedKFold(y, n_folds=2) + >>> len(skf) + 2 + >>> print(skf) # doctest: +NORMALIZE_WHITESPACE + sklearn.model_selection.partition.StratifiedKFold(labels=[0 0 1 1], n_folds=2, + shuffle=False, random_state=None) + >>> for train_index, test_index in skf: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [1 3] TEST: [0 2] + TRAIN: [0 2] TEST: [1 3] + + Notes + ----- + All the folds have size trunc(n_samples / n_folds), the last one has the + complementary. + + """ + + def __init__(self, y=None, n_folds=3, indices=None, shuffle=False, + random_state=None): + #TODO: deprecation warning if y is not none, should use split + n = len(y) if y is not None else None + super(StratifiedKFold, self).__init__( + n, n_folds, indices, shuffle, random_state) + self.y = np.asarray(y) if y is not None else None + + def _make_test_folds(self, y): + if y is None: + if self.y is None: + raise ValueError("Must supply y in constructor or split") + y = self.y + + n_samples = y.shape[0] + unique_labels, y_inversed = np.unique(y, return_inverse=True) + label_counts = np.bincount(y_inversed) + min_labels = np.min(label_counts) + if self.n_folds > min_labels: + warnings.warn(("The least populated class in y has only %d" + " members, which is too few. The minimum" + " number of labels for any class cannot" + " be less than n_folds=%d." 
+ % (min_labels, self.n_folds)), Warning) + + # don't want to use the same seed in each label's shuffle + if self.shuffle: + rng = check_random_state(self.random_state) + else: + rng = self.random_state + + # pre-assign each sample to a test fold index using individual KFold + # splitting strategies for each label so as to respect the + # balance of labels + per_label_cvs = [ + KFold(max(c, self.n_folds), self.n_folds, shuffle=self.shuffle, + random_state=rng) for c in label_counts] + test_folds = np.zeros(n_samples, dtype=np.int) + for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)): + for label, (_, test_split) in zip(unique_labels, per_label_splits): + label_test_folds = test_folds[y == label] + # the test split can be too big because we used + # KFold(max(c, self.n_folds), self.n_folds) instead of + # KFold(c, self.n_folds) to make it possible to not crash even + # if the data is not 100% stratifiable for all the labels + # (we use a warning instead of raising an exception) + # If this is the case, let's trim it: + test_split = test_split[test_split < len(label_test_folds)] + label_test_folds[test_split] = test_fold_idx + test_folds[y == label] = label_test_folds + + return test_folds + + def _iter_test_masks(self, y): + test_folds = self._make_test_folds(y) + for i in range(self.n_folds): + yield test_folds == i + + def __repr__(self): + return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % ( + self.__class__.__module__, + self.__class__.__name__, + self.y, + self.n_folds, + self.shuffle, + self.random_state, + ) + + def __len__(self): + return self.n_folds + + +class LeaveOneLabelOut(_PartitionIterator): + """Leave-One-Label_Out cross-validation iterator + + Provides train/test indices to split data according to a third-party + provided label. This label information can be used to encode arbitrary + domain specific stratifications of the samples as integers. + + For instance the labels could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + Parameters + ---------- + labels : array-like of int with shape (n_samples,) + Arbitrary domain-specific stratification of the data to be used + to draw the splits. + + Examples + -------- + >>> from sklearn import model_selection + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 1, 2]) + >>> labels = np.array([1, 1, 2, 2]) + >>> lol = model_selection.LeaveOneLabelOut(labels) + >>> len(lol) + 2 + >>> print(lol) + sklearn.model_selection.partition.LeaveOneLabelOut(labels=[1 1 2 2]) + >>> for train_index, test_index in lol: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... 
print(X_train, X_test, y_train, y_test)
+    TRAIN: [2 3] TEST: [0 1]
+    [[5 6]
+     [7 8]] [[1 2]
+     [3 4]] [1 2] [1 2]
+    TRAIN: [0 1] TEST: [2 3]
+    [[1 2]
+     [3 4]] [[5 6]
+     [7 8]] [1 2] [1 2]
+
+    """
+
+    def __init__(self, labels=None, indices=None):
+        n = None if labels is None else len(labels)
+        super(LeaveOneLabelOut, self).__init__(n, indices)
+        self.labels = labels
+
+    def _iter_test_masks(self, y):
+        labels = self.labels if y is None else y
+        # We make a copy of labels to avoid side-effects during iteration
+        labels = np.array(labels, copy=True)
+        unique_labels = np.unique(labels)
+        n_unique_labels = len(unique_labels)
+        for i in unique_labels:
+            yield labels == i
+
+    def __repr__(self):
+        return '%s.%s(labels=%s)' % (
+            self.__class__.__module__,
+            self.__class__.__name__,
+            self.labels,
+        )
+
+    def __len__(self):
+        # TODO: remove?
+        return len(np.unique(self.labels))
+
+
+class LeavePLabelOut(_PartitionIterator):
+    """Leave-P-Label_Out cross-validation iterator
+
+    Provides train/test indices to split data according to a third-party
+    provided label. This label information can be used to encode arbitrary
+    domain specific stratifications of the samples as integers.
+
+    For instance the labels could be the year of collection of the samples
+    and thus allow for cross-validation against time-based splits.
+
+    The difference between LeavePLabelOut and LeaveOneLabelOut is that
+    the former builds the test sets with all the samples assigned to
+    ``p`` different values of the labels while the latter uses samples
+    all assigned the same labels.
+
+    Parameters
+    ----------
+    labels : array-like of int with shape (n_samples,)
+        Arbitrary domain-specific stratification of the data to be used
+        to draw the splits.
+
+    p : int
+        Number of labels to leave out in the test split.
+
+    Examples
+    --------
+    >>> from sklearn import model_selection
+    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
+    >>> y = np.array([1, 2, 1])
+    >>> labels = np.array([1, 2, 3])
+    >>> lpl = model_selection.LeavePLabelOut(labels, p=2)
+    >>> len(lpl)
+    3
+    >>> print(lpl)
+    sklearn.model_selection.partition.LeavePLabelOut(labels=[1 2 3], p=2)
+    >>> for train_index, test_index in lpl:
+    ...    print("TRAIN:", train_index, "TEST:", test_index)
+    ...    X_train, X_test = X[train_index], X[test_index]
+    ...    y_train, y_test = y[train_index], y[test_index]
+    ...    
print(X_train, X_test, y_train, y_test) + TRAIN: [2] TEST: [0 1] + [[5 6]] [[1 2] + [3 4]] [1] [1 2] + TRAIN: [1] TEST: [0 2] + [[3 4]] [[1 2] + [5 6]] [2] [1 1] + TRAIN: [0] TEST: [1 2] + [[1 2]] [[3 4] + [5 6]] [1] [2 1] + """ + + def __init__(self, labels=None, p=None, indices=None): + n = None if labels is None else len(labels) + super(LeavePLabelOut, self).__init__(n, indices) + if p is None: + raise ValueError("LeavePLabelOut: must supply p") + + self.p = p + self.labels = labels + + def _iter_test_masks(self, y): + labels = self.labels if y is None else y + # We make a copy of labels to avoid side-effects during iteration + labels = np.array(labels, copy=True) + unique_labels = np.unique(labels) + n_unique_labels = len(unique_labels) + comb = combinations(range(n_unique_labels), self.p) + for idx in comb: + test_index = self._empty_mask(labels) + idx = np.array(idx) + for l in unique_labels[idx]: + test_index[labels == l] = True + yield test_index + + def __repr__(self): + return '%s.%s(labels=%s, p=%s)' % ( + self.__class__.__module__, + self.__class__.__name__, + self.labels, + self.p, + ) + + def __len__(self): + return int(factorial(self.n_unique_labels) / + factorial(self.n_unique_labels - self.p) / + factorial(self.p)) + + +def train_test_split(*arrays, **options): + """Split arrays or matrices into random train and test subsets + + Quick utility that wraps calls to ``check_arrays`` and + ``next(iter(ShuffleSplit(n_samples)))`` and application to input + data into a single call for splitting (and optionally subsampling) + data in a oneliner. + + Parameters + ---------- + *arrays : sequence of arrays or scipy.sparse matrices with same shape[0] + Python lists or tuples occurring in arrays are converted to 1D numpy + arrays. + + test_size : float, int, or None (default is None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If + int, represents the absolute number of test samples. If None, + the value is automatically set to the complement of the train size. + If train size is also None, test size is set to 0.25. + + train_size : float, int, or None (default is None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int or RandomState + Pseudo-random number generator state used for random sampling. + + dtype : a numpy dtype instance, None by default + Enforce a specific dtype. + + Returns + ------- + splitting : list of arrays, length=2 * len(arrays) + List containing train-test split of input array. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection.partition import train_test_split + >>> a, b = np.arange(10).reshape((5, 2)), range(5) + >>> a + array([[0, 1], + [2, 3], + [4, 5], + [6, 7], + [8, 9]]) + >>> list(b) + [0, 1, 2, 3, 4] + + >>> a_train, a_test, b_train, b_test = train_test_split( + ... a, b, test_size=0.33, random_state=42) + ... 
+ >>> a_train + array([[4, 5], + [0, 1], + [6, 7]]) + >>> b_train + array([2, 0, 3]) + >>> a_test + array([[2, 3], + [8, 9]]) + >>> b_test + array([1, 4]) + + """ + n_arrays = len(arrays) + if n_arrays == 0: + raise ValueError("At least one array required as input") + + test_size = options.pop('test_size', None) + train_size = options.pop('train_size', None) + random_state = options.pop('random_state', None) + options['sparse_format'] = 'csr' + options['allow_nans'] = True + + if test_size is None and train_size is None: + test_size = 0.25 + + arrays = check_arrays(*arrays, **options) + n_samples = arrays[0].shape[0] + cv = ShuffleSplit(n_samples, test_size=test_size, + train_size=train_size, + random_state=random_state) + + train, test = next(iter(cv)) + return list(chain.from_iterable((a[train], a[test]) for a in arrays)) + + +train_test_split.__test__ = False # to avoid a pb with nosetests + + +class BaseShuffleSplit(with_metaclass(ABCMeta)): + """Base class for ShuffleSplit and StratifiedShuffleSplit""" + + def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, + indices=None, random_state=None, n_iterations=None): + if indices is None: + indices = True + else: + warnings.warn("The indices parameter is deprecated and will be " + "removed (assumed True) in 0.17", DeprecationWarning) + self.n = n + self.n_iter = n_iter + if n_iterations is not None: # pragma: no cover + warnings.warn("n_iterations was renamed to n_iter for consistency " + " and will be removed in 0.16.") + self.n_iter = n_iterations + self.test_size = test_size + self.train_size = train_size + self.random_state = random_state + self._indices = indices + self.n_train, self.n_test = _validate_shuffle_split(n, + test_size, + train_size) + + @property + def indices(self): + warnings.warn("The indices attribute is deprecated and will be " + "removed (assumed True) in 0.17", DeprecationWarning, + stacklevel=1) + return self._indices + + def __iter__(self): + if self._indices: + for train, test in self._iter_indices(): + yield train, test + return + for train, test in self._iter_indices(): + train_m = np.zeros(self.n, dtype=bool) + test_m = np.zeros(self.n, dtype=bool) + train_m[train] = True + test_m[test] = True + yield train_m, test_m + + @abstractmethod + def _iter_indices(self): + """Generate (train, test) indices""" + + +class ShuffleSplit(BaseShuffleSplit): + """Random permutation cross-validation iterator. + + Yields indices to split data into training and test sets. + + Note: contrary to other cross-validation strategies, random splits + do not guarantee that all folds will be different, although this is + still very likely for sizeable datasets. + + Parameters + ---------- + n : int + Total number of elements in the dataset. + + n_iter : int (default 10) + Number of re-shuffling & splitting iterations. + + test_size : float (default 0.1), int, or None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If + int, represents the absolute number of test samples. If None, + the value is automatically set to the complement of the train size. + + train_size : float, int, or None (default is None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int or RandomState + Pseudo-random number generator state used for random sampling. 
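As the implementation above suggests, ``train_test_split`` is essentially one draw from ``ShuffleSplit``. A rough equivalence sketch, using the import paths introduced by this patch and toy arrays:

import numpy as np
from sklearn.model_selection import ShuffleSplit, train_test_split

X = np.arange(10).reshape((5, 2))
y = np.arange(5)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

cv = ShuffleSplit(len(y), test_size=0.4, random_state=0)   # same seed, one iteration
train, test = next(iter(cv))
print(np.array_equal(X_train, X[train]), np.array_equal(X_test, X[test]))  # expected: True True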
+ + Examples + -------- + >>> from sklearn import cross_validation + >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, + ... test_size=.25, random_state=0) + >>> len(rs) + 3 + >>> print(rs) + ... # doctest: +ELLIPSIS + ShuffleSplit(4, n_iter=3, test_size=0.25, ...) + >>> for train_index, test_index in rs: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... + TRAIN: [3 1 0] TEST: [2] + TRAIN: [2 1 3] TEST: [0] + TRAIN: [0 2 1] TEST: [3] + + >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, + ... train_size=0.5, test_size=.25, random_state=0) + >>> for train_index, test_index in rs: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... + TRAIN: [3 1] TEST: [2] + TRAIN: [2 1] TEST: [0] + TRAIN: [0 2] TEST: [3] + + See also + -------- + Bootstrap: cross-validation using re-sampling with replacement. + """ + + def _iter_indices(self): + rng = check_random_state(self.random_state) + for i in range(self.n_iter): + # random partition + permutation = rng.permutation(self.n) + ind_test = permutation[:self.n_test] + ind_train = permutation[self.n_test:self.n_test + self.n_train] + yield ind_train, ind_test + + def __repr__(self): + return ('%s(%d, n_iter=%d, test_size=%s, ' + 'random_state=%s)' % ( + self.__class__.__name__, + self.n, + self.n_iter, + str(self.test_size), + self.random_state, + )) + + def __len__(self): + return self.n_iter + + +def _validate_shuffle_split(n, test_size, train_size): + if test_size is None and train_size is None: + raise ValueError( + 'test_size and train_size can not both be None') + + if test_size is not None: + if np.asarray(test_size).dtype.kind == 'f': + if test_size >= 1.: + raise ValueError( + 'test_size=%f should be smaller ' + 'than 1.0 or be an integer' % test_size) + elif np.asarray(test_size).dtype.kind == 'i': + if test_size >= n: + raise ValueError( + 'test_size=%d should be smaller ' + 'than the number of samples %d' % (test_size, n)) + else: + raise ValueError("Invalid value for test_size: %r" % test_size) + + if train_size is not None: + if np.asarray(train_size).dtype.kind == 'f': + if train_size >= 1.: + raise ValueError("train_size=%f should be smaller " + "than 1.0 or be an integer" % train_size) + elif np.asarray(test_size).dtype.kind == 'f' and \ + train_size + test_size > 1.: + raise ValueError('The sum of test_size and train_size = %f, ' + 'should be smaller than 1.0. Reduce ' + 'test_size and/or train_size.' % + (train_size + test_size)) + elif np.asarray(train_size).dtype.kind == 'i': + if train_size >= n: + raise ValueError("train_size=%d should be smaller " + "than the number of samples %d" % + (train_size, n)) + else: + raise ValueError("Invalid value for train_size: %r" % train_size) + + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n) + elif np.asarray(test_size).dtype.kind == 'i': + n_test = float(test_size) + + if train_size is None: + n_train = n - n_test + else: + if np.asarray(train_size).dtype.kind == 'f': + n_train = floor(train_size * n) + else: + n_train = float(train_size) + + if test_size is None: + n_test = n - n_train + + if n_train + n_test > n: + raise ValueError('The sum of train_size and test_size = %d, ' + 'should be smaller than the number of ' + 'samples %d. Reduce test_size and/or ' + 'train_size.' % (n_train + n_test, n)) + + return int(n_train), int(n_test) + + +class StratifiedShuffleSplit(BaseShuffleSplit): + """Stratified ShuffleSplit cross validation iterator + + Provides train/test indices to split data in train test sets. 
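A worked example of the size validation above, with hypothetical values: a float ``test_size`` is turned into a count with ``ceil``, and the training size defaults to the complement.

from math import ceil

n, test_size, train_size = 10, 0.25, None
n_test = int(ceil(test_size * n))                  # 3
n_train = n - n_test if train_size is None else train_size
print(n_train, n_test)                             # 7 3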
+ + This cross-validation object is a merge of StratifiedKFold and + ShuffleSplit, which returns stratified randomized folds. The folds + are made by preserving the percentage of samples for each class. + + Note: like the ShuffleSplit strategy, stratified random splits + do not guarantee that all folds will be different, although this is + still very likely for sizeable datasets. + + Parameters + ---------- + y : array, [n_samples] + Labels of samples. + + n_iter : int (default 10) + Number of re-shuffling & splitting iterations. + + test_size : float (default 0.1), int, or None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If + int, represents the absolute number of test samples. If None, + the value is automatically set to the complement of the train size. + + train_size : float, int, or None (default is None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int or RandomState + Pseudo-random number generator state used for random sampling. + + Examples + -------- + >>> from sklearn.cross_validation import StratifiedShuffleSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0) + >>> len(sss) + 3 + >>> print(sss) # doctest: +ELLIPSIS + StratifiedShuffleSplit(labels=[0 0 1 1], n_iter=3, ...) + >>> for train_index, test_index in sss: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [1 2] TEST: [3 0] + TRAIN: [0 2] TEST: [1 3] + TRAIN: [0 2] TEST: [3 1] + """ + + def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, + indices=None, random_state=None, n_iterations=None): + + super(StratifiedShuffleSplit, self).__init__( + len(y), n_iter, test_size, train_size, indices, random_state, + n_iterations) + self.y = np.array(y) + self.classes, self.y_indices = np.unique(y, return_inverse=True) + n_cls = self.classes.shape[0] + + if np.min(np.bincount(self.y_indices)) < 2: + raise ValueError("The least populated class in y has only 1" + " member, which is too few. 
The minimum" + " number of labels for any class cannot" + " be less than 2.") + + if self.n_train < n_cls: + raise ValueError('The train_size = %d should be greater or ' + 'equal to the number of classes = %d' % + (self.n_train, n_cls)) + if self.n_test < n_cls: + raise ValueError('The test_size = %d should be greater or ' + 'equal to the number of classes = %d' % + (self.n_test, n_cls)) + + def _iter_indices(self): + rng = check_random_state(self.random_state) + cls_count = np.bincount(self.y_indices) + p_i = cls_count / float(self.n) + n_i = np.round(self.n_train * p_i).astype(int) + t_i = np.minimum(cls_count - n_i, + np.round(self.n_test * p_i).astype(int)) + + for n in range(self.n_iter): + train = [] + test = [] + + for i, cls in enumerate(self.classes): + permutation = rng.permutation(cls_count[i]) + cls_i = np.where((self.y == cls))[0][permutation] + + train.extend(cls_i[:n_i[i]]) + test.extend(cls_i[n_i[i]:n_i[i] + t_i[i]]) + + # Because of rounding issues (as n_train and n_test are not + # dividers of the number of elements per class), we may end + # up here with less samples in train and test than asked for. + if len(train) < self.n_train or len(test) < self.n_test: + # We complete by affecting randomly the missing indexes + missing_idx = np.where(np.bincount(train + test, + minlength=len(self.y)) == 0, + )[0] + missing_idx = rng.permutation(missing_idx) + train.extend(missing_idx[:(self.n_train - len(train))]) + test.extend(missing_idx[-(self.n_test - len(test)):]) + + train = rng.permutation(train) + test = rng.permutation(test) + + yield train, test + + def __repr__(self): + return ('%s(labels=%s, n_iter=%d, test_size=%s, ' + 'random_state=%s)' % ( + self.__class__.__name__, + self.y, + self.n_iter, + str(self.test_size), + self.random_state, + )) + + def __len__(self): + return self.n_iter + + +def check_cv(cv, X=None, y=None, classifier=False): + """Input checker utility for building a CV in a user friendly way. + + Parameters + ---------- + cv : int, a cv generator instance, or None + The input specifying which cv generator to use. It can be an + integer, in which case it is the number of folds in a KFold, + None, in which case 3 fold is used, or another object, that + will then be used as a cv generator. + + X : array-like + The data the cross-val object will be applied on. + + y : array-like + The target variable for a supervised learning problem. + + classifier : boolean optional + Whether the task is a classification task, in which case + stratified KFold will be used. + + Returns + ------- + checked_cv: a cross-validation generator instance. + The return value is guaranteed to be a cv generator instance, whatever + the input type. + """ + return _check_cv(cv, X=X, y=y, classifier=classifier, warn_mask=True) + + +def _check_cv(cv, X=None, y=None, classifier=False, warn_mask=False): + # This exists for internal use while indices is being deprecated. 
+ is_sparse = sp.issparse(X) + needs_indices = is_sparse or not hasattr(X, "shape") + if cv is None: + cv = 3 + if isinstance(cv, numbers.Integral): + if warn_mask and not needs_indices: + warnings.warn('check_cv will return indices instead of boolean ' + 'masks from 0.17', DeprecationWarning) + else: + needs_indices = None + if classifier: + cv = StratifiedKFold(y, cv, indices=needs_indices) + else: + if not is_sparse: + n_samples = len(X) + else: + n_samples = X.shape[0] + cv = KFold(n_samples, cv, indices=needs_indices) + if needs_indices and not getattr(cv, "_indices", True): + raise ValueError("Sparse data and lists require indices-based cross" + " validation generator, got: %r", cv) + return cv diff --git a/sklearn/model_selection/scoring.py b/sklearn/model_selection/scoring.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/model_selection/search.py b/sklearn/model_selection/search.py new file mode 100644 index 0000000000000..878c7157398c0 --- /dev/null +++ b/sklearn/model_selection/search.py @@ -0,0 +1,548 @@ +""" +The :mod:`sklearn.model_selection.search` includes utilities to fine-tune the +parameters of an estimator. +""" + +from __future__ import print_function + +# Author: Alexandre Gramfort , +# Gael Varoquaux +# Andreas Mueller +# Olivier Grisel +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod +from collections import namedtuple, Sized + +import numpy as np + +from sklearn.base import BaseEstimator, is_classifier, clone +from sklearn.base import MetaEstimatorMixin +from sklearn.externals.joblib import Parallel, delayed +from sklearn.externals import six +from sklearn.utils.validation import _num_samples, check_arrays +from sklearn.metrics.scorer import check_scoring +from .partition import _check_cv as check_cv +from .validate import _fit_and_score +from .utils import ParameterGrid, ParameterSampler + + +__all__ = ['GridSearchCV', 'RandomizedSearchCV'] + + +def _check_param_grid(param_grid): + if hasattr(param_grid, 'items'): + param_grid = [param_grid] + + for p in param_grid: + for v in p.values(): + if isinstance(v, np.ndarray) and v.ndim > 1: + raise ValueError("Parameter array should be one-dimensional.") + + check = [isinstance(v, k) for k in (list, tuple, np.ndarray)] + if not True in check: + raise ValueError("Parameter values should be a list.") + + if len(v) == 0: + raise ValueError("Parameter values should be a non-empty " + "list.") + + +class _CVScoreTuple (namedtuple('_CVScoreTuple', + ('parameters', + 'mean_validation_score', + 'cv_validation_scores'))): + # A raw namedtuple is very memory efficient as it packs the attributes + # in a struct to get rid of the __dict__ of attributes in particular it + # does not copy the string for the keys on each instance. + # By deriving a namedtuple class just to introduce the __repr__ method we + # would also reintroduce the __dict__ on the instance. By telling the + # Python interpreter that this subclass uses static __slots__ instead of + # dynamic attributes. Furthermore we don't need any additional slot in the + # subclass so we set __slots__ to the empty tuple. 
+ __slots__ = () + + def __repr__(self): + """Simple custom repr to summarize the main info""" + return "mean: {0:.5f}, std: {1:.5f}, params: {2}".format( + self.mean_validation_score, + np.std(self.cv_validation_scores), + self.parameters) + + +class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, + MetaEstimatorMixin)): + """Base class for hyper parameter search with cross-validation.""" + + @abstractmethod + def __init__(self, estimator, scoring=None, loss_func=None, + score_func=None, fit_params=None, n_jobs=1, iid=True, + refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): + + self.scoring = scoring + self.estimator = estimator + self.loss_func = loss_func + self.score_func = score_func + self.n_jobs = n_jobs + self.fit_params = fit_params if fit_params is not None else {} + self.iid = iid + self.refit = refit + self.cv = cv + self.verbose = verbose + self.pre_dispatch = pre_dispatch + + def score(self, X, y=None): + """Returns the score on the given test data and labels, if the search + estimator has been refit. The ``score`` function of the best estimator + is used, or the ``scoring`` parameter where unavailable. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Input data, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape = [n_samples] or [n_samples, n_output], optional + Target relative to X for classification or regression; + None for unsupervised learning. + + Returns + ------- + score : float + + """ + if hasattr(self.best_estimator_, 'score'): + return self.best_estimator_.score(X, y) + if self.scorer_ is None: + raise ValueError("No score function explicitly defined, " + "and the estimator doesn't provide one %s" + % self.best_estimator_) + return self.scorer_(self.best_estimator_, X, y) + + @property + def predict(self): + return self.best_estimator_.predict + + @property + def predict_proba(self): + return self.best_estimator_.predict_proba + + @property + def decision_function(self): + return self.best_estimator_.decision_function + + @property + def transform(self): + return self.best_estimator_.transform + + def _fit(self, X, y, parameter_iterable): + """Actual fitting, performing the search over parameters.""" + + estimator = self.estimator + cv = self.cv + self.scorer_ = check_scoring(self.estimator, scoring=self.scoring, + loss_func=self.loss_func, + score_func=self.score_func) + + n_samples = _num_samples(X) + X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr', + allow_nans=True) + + if y is not None: + if len(y) != n_samples: + raise ValueError('Target variable (y) has a different number ' + 'of samples (%i) than data (X: %i samples)' + % (len(y), n_samples)) + y = np.asarray(y) + cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) + + if self.verbose > 0: + if isinstance(parameter_iterable, Sized): + n_candidates = len(parameter_iterable) + print("Fitting {0} folds for each of {1} candidates, totalling" + " {2} fits".format(len(cv), n_candidates, + n_candidates * len(cv))) + + base_estimator = clone(self.estimator) + + pre_dispatch = self.pre_dispatch + + out = Parallel( + n_jobs=self.n_jobs, verbose=self.verbose, + pre_dispatch=pre_dispatch + )( + delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, + train, test, self.verbose, parameters, + self.fit_params, return_parameters=True) + for parameters in parameter_iterable + for train, test in cv) + + # Out is a list of triplet: score, estimator, n_test_samples + n_fits = 
len(out) + n_folds = len(cv) + + scores = list() + grid_scores = list() + for grid_start in range(0, n_fits, n_folds): + n_test_samples = 0 + score = 0 + all_scores = [] + for this_score, this_n_test_samples, _, parameters in \ + out[grid_start:grid_start + n_folds]: + all_scores.append(this_score) + if self.iid: + this_score *= this_n_test_samples + n_test_samples += this_n_test_samples + score += this_score + if self.iid: + score /= float(n_test_samples) + else: + score /= float(n_folds) + scores.append((score, parameters)) + # TODO: shall we also store the test_fold_sizes? + grid_scores.append(_CVScoreTuple( + parameters, + score, + np.array(all_scores))) + # Store the computed scores + self.grid_scores_ = grid_scores + + # Find the best parameters by comparing on the mean validation score: + # note that `sorted` is deterministic in the way it breaks ties + best = sorted(grid_scores, key=lambda x: x.mean_validation_score, + reverse=True)[0] + self.best_params_ = best.parameters + self.best_score_ = best.mean_validation_score + + if self.refit: + # fit the best estimator using the entire dataset + # clone first to work around broken estimators + best_estimator = clone(base_estimator).set_params( + **best.parameters) + if y is not None: + best_estimator.fit(X, y, **self.fit_params) + else: + best_estimator.fit(X, **self.fit_params) + self.best_estimator_ = best_estimator + return self + + +class GridSearchCV(BaseSearchCV): + """Exhaustive search over specified parameter values for an estimator. + + Important members are fit, predict. + + GridSearchCV implements a "fit" method and a "predict" method like + any classifier except that the parameters of the classifier + used to predict is optimized by cross-validation. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + A object of that type is instantiated for each grid point. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + scoring : string, callable or None, optional, default: None + A string (see model evaluation documentation) or + a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + fit_params : dict, optional + Parameters to pass to the fit method. + + n_jobs : int, optional + Number of jobs to run in parallel (default 1). + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + iid : boolean, optional + If True, the data is assumed to be identically distributed across + the folds, and the loss minimized is the total loss per sample, + and not the mean loss across the folds. + + cv : integer or cross-validation generator, optional + If an integer is passed, it is the number of folds (default 3). 
+ Specific cross-validation objects can be passed, see + sklearn.cross_validation module for the list of possible objects + + refit : boolean + Refit the best estimator with the entire dataset. + If "False", it is impossible to make predictions using + this GridSearchCV instance after fitting. + + verbose : integer + Controls the verbosity: the higher, the more messages. + + Examples + -------- + >>> from sklearn import svm, grid_search, datasets + >>> iris = datasets.load_iris() + >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} + >>> svr = svm.SVC() + >>> clf = grid_search.GridSearchCV(svr, parameters) + >>> clf.fit(iris.data, iris.target) + ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + GridSearchCV(cv=None, + estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=..., + degree=..., gamma=..., kernel='rbf', max_iter=-1, + probability=False, random_state=None, shrinking=True, + tol=..., verbose=False), + fit_params={}, iid=..., loss_func=..., n_jobs=1, + param_grid=..., pre_dispatch=..., refit=..., score_func=..., + scoring=..., verbose=...) + + + Attributes + ---------- + `grid_scores_` : list of named tuples + Contains scores for all parameter combinations in param_grid. + Each entry corresponds to one parameter setting. + Each named tuple has the attributes: + + * ``parameters``, a dict of parameter settings + * ``mean_validation_score``, the mean score over the + cross-validation folds + * ``cv_validation_scores``, the list of scores for each fold + + `best_estimator_` : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. + + `best_score_` : float + Score of best_estimator on the left out data. + + `best_params_` : dict + Parameter setting that gave the best results on the hold out data. + + `scorer_` : function + Scorer function used on the held out data to choose the best + parameters for the model. + + Notes + ------ + The parameters selected are those that maximize the score of the left out + data, unless an explicit score is passed in which case it is used instead. + + If `n_jobs` was set to a value higher than one, the data is copied for each + point in the grid (and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + See Also + --------- + :class:`ParameterGrid`: + generates all the combinations of a an hyperparameter grid. + + :func:`sklearn.cross_validation.train_test_split`: + utility function to split the data into a development set usable + for fitting a GridSearchCV instance and an evaluation set for + its final evaluation. + + :func:`sklearn.metrics.make_scorer`: + Make a scorer from a performance metric or loss function. + + """ + + def __init__(self, estimator, param_grid, scoring=None, loss_func=None, + score_func=None, fit_params=None, n_jobs=1, iid=True, + refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): + super(GridSearchCV, self).__init__( + estimator, scoring, loss_func, score_func, fit_params, n_jobs, iid, + refit, cv, verbose, pre_dispatch) + self.param_grid = param_grid + _check_param_grid(param_grid) + + def fit(self, X, y=None): + """Run fit with all sets of parameters. 
+
+        Parameters
+        ----------
+
+        X : array-like, shape = [n_samples, n_features]
+            Training vector, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+            Target relative to X for classification or regression;
+            None for unsupervised learning.
+
+        """
+        return self._fit(X, y, ParameterGrid(self.param_grid))
+
+
+class RandomizedSearchCV(BaseSearchCV):
+    """Randomized search on hyper parameters.
+
+    RandomizedSearchCV implements a "fit" method and a "predict" method like
+    any classifier except that the parameters of the classifier
+    used to predict are optimized by cross-validation.
+
+    In contrast to GridSearchCV, not all parameter values are tried out, but
+    rather a fixed number of parameter settings is sampled from the specified
+    distributions. The number of parameter settings that are tried is
+    given by n_iter.
+
+    Parameters
+    ----------
+    estimator : object type that implements the "fit" and "predict" methods
+        An object of that type is instantiated for each parameter setting.
+
+    param_distributions : dict
+        Dictionary with parameter names (string) as keys and distributions
+        or lists of parameters to try. Distributions must provide a ``rvs``
+        method for sampling (such as those from scipy.stats.distributions).
+        If a list is given, it is sampled uniformly.
+
+    n_iter : int, default=10
+        Number of parameter settings that are sampled. n_iter trades
+        off runtime vs quality of the solution.
+
+    scoring : string, callable or None, optional, default: None
+        A string (see model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+
+    fit_params : dict, optional
+        Parameters to pass to the fit method.
+
+    n_jobs : int, optional
+        Number of jobs to run in parallel (default 1).
+
+    pre_dispatch : int, or string, optional
+        Controls the number of jobs that get dispatched during parallel
+        execution. Reducing this number can be useful to avoid an
+        explosion of memory consumption when more jobs get dispatched
+        than CPUs can process. This parameter can be:
+
+            - None, in which case all the jobs are immediately
+              created and spawned. Use this for lightweight and
+              fast-running jobs, to avoid delays due to on-demand
+              spawning of the jobs
+
+            - An int, giving the exact number of total jobs that are
+              spawned
+
+            - A string, giving an expression as a function of n_jobs,
+              as in '2*n_jobs'
+
+    iid : boolean, optional
+        If True, the data is assumed to be identically distributed across
+        the folds, and the loss minimized is the total loss per sample,
+        and not the mean loss across the folds.
+
+    cv : integer or cross-validation generator, optional
+        If an integer is passed, it is the number of folds (default 3).
+        Specific cross-validation objects can be passed, see
+        sklearn.cross_validation module for the list of possible objects
+
+    refit : boolean
+        Refit the best estimator with the entire dataset.
+        If "False", it is impossible to make predictions using
+        this RandomizedSearchCV instance after fitting.
+
+    verbose : integer
+        Controls the verbosity: the higher, the more messages.
+
+
+    Attributes
+    ----------
+    `grid_scores_` : list of named tuples
+        Contains scores for all parameter settings that have been evaluated.
+        Each entry corresponds to one parameter setting.
+        Each named tuple has the attributes:
+
+            * ``parameters``, a dict of parameter settings
+            * ``mean_validation_score``, the mean score over the
+              cross-validation folds
+            * ``cv_validation_scores``, the list of scores for each fold
+
+    `best_estimator_` : estimator
+        Estimator that was chosen by the search, i.e. estimator
+        which gave highest score (or smallest loss if specified)
+        on the left out data.
+
+    `best_score_` : float
+        Score of best_estimator on the left out data.
+
+    `best_params_` : dict
+        Parameter setting that gave the best results on the hold out data.
+
+    Notes
+    -----
+    The parameters selected are those that maximize the score of the held-out
+    data, according to the scoring parameter.
+
+    If `n_jobs` was set to a value higher than one, the data is copied for each
+    parameter setting (and not `n_jobs` times). This is done for efficiency
+    reasons if individual jobs take very little time, but may raise errors if
+    the dataset is large and not enough memory is available. A workaround in
+    this case is to set `pre_dispatch`. Then, the memory is copied only
+    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
+    n_jobs`.
+
+    See Also
+    --------
+    :class:`GridSearchCV`:
+        Does exhaustive search over a grid of parameters.
+
+    :class:`ParameterSampler`:
+        A generator over parameter settings, constructed from
+        param_distributions.
+
+    """
+
+    def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
+                 fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
+                 verbose=0, pre_dispatch='2*n_jobs', random_state=None):
+
+        self.param_distributions = param_distributions
+        self.n_iter = n_iter
+        self.random_state = random_state
+        super(RandomizedSearchCV, self).__init__(
+            estimator=estimator, scoring=scoring, fit_params=fit_params,
+            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
+            pre_dispatch=pre_dispatch)
+
+    def fit(self, X, y=None):
+        """Run fit on the estimator with randomly drawn parameters.
+
+        Parameters
+        ----------
+        X : array-like, shape = [n_samples, n_features]
+            Training vector, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+            Target relative to X for classification or regression;
+            None for unsupervised learning.
+
+        """
+        sampled_params = ParameterSampler(self.param_distributions,
+                                          self.n_iter,
+                                          random_state=self.random_state)
+        return self._fit(X, y, sampled_params)
diff --git a/sklearn/model_selection/utils.py b/sklearn/model_selection/utils.py
new file mode 100644
index 0000000000000..b29b57329690a
--- /dev/null
+++ b/sklearn/model_selection/utils.py
@@ -0,0 +1,222 @@
+"""
+The :mod:`sklearn.model_selection.utils` module includes
+"""
+#TODO Complete docstring
+from __future__ import print_function
+
+# Author: Alexandre Gramfort ,
+#         Gael Varoquaux
+#         Andreas Mueller
+#         Olivier Grisel
+# License: BSD 3 clause
+
+from collections import Mapping
+from functools import partial, reduce
+from itertools import product
+import operator
+
+import numpy as np
+
+from sklearn.model_selection.validate import _fit_and_score
+from sklearn.utils import check_random_state
+
+__all__ = ['ParameterGrid', 'fit_grid_point', 'ParameterSampler']
+
+
+class ParameterGrid(object):
+    """Grid of parameters with a discrete number of values for each.
+
+    Can be used to iterate over parameter value combinations with the
+    Python built-in function iter.
+ + Parameters + ---------- + param_grid : dict of string to sequence, or sequence of such + The parameter grid to explore, as a dictionary mapping estimator + parameters to sequences of allowed values. + + An empty dict signifies default parameters. + + A sequence of dicts signifies a sequence of grids to search, and is + useful to avoid exploring parameter combinations that make no sense + or have no effect. See the examples below. + + Examples + -------- + >>> from sklearn.grid_search import ParameterGrid + >>> param_grid = {'a': [1, 2], 'b': [True, False]} + >>> list(ParameterGrid(param_grid)) == ( + ... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, + ... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) + True + + >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] + >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, + ... {'kernel': 'rbf', 'gamma': 1}, + ... {'kernel': 'rbf', 'gamma': 10}] + True + + See also + -------- + :class:`GridSearchCV`: + uses ``ParameterGrid`` to perform a full parallelized parameter search. + """ + + def __init__(self, param_grid): + if isinstance(param_grid, Mapping): + # wrap dictionary in a singleton list to support either dict + # or list of dicts + param_grid = [param_grid] + self.param_grid = param_grid + + def __iter__(self): + """Iterate over the points in the grid. + + Returns + ------- + params : iterator over dict of string to any + Yields dictionaries mapping each estimator parameter to one of its + allowed values. + """ + for p in self.param_grid: + # Always sort the keys of a dictionary, for reproducibility + items = sorted(p.items()) + if not items: + yield {} + else: + keys, values = zip(*items) + for v in product(*values): + params = dict(zip(keys, v)) + yield params + + def __len__(self): + """Number of points on the grid.""" + # Product function that can handle iterables (np.product can't). + product = partial(reduce, operator.mul) + return sum(product(len(v) for v in p.values()) if p else 1 + for p in self.param_grid) + + +def fit_grid_point(X, y, estimator, parameters, train, test, scorer, + verbose, **fit_params): + """Run fit on one set of parameters. + + Parameters + ---------- + X : array-like, sparse matrix or list + Input data. + + y : array-like or None + Targets for input data. + + estimator : estimator object + This estimator will be cloned and then fitted. + + parameters : dict + Parameters to be set on estimator for this grid point. + + train : ndarray, dtype int or bool + Boolean mask or indices for training set. + + test : ndarray, dtype int or bool + Boolean mask or indices for test set. + + scorer : callable or None. + If provided must be a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + verbose : int + Verbosity level. + + **fit_params : kwargs + Additional parameter passed to the fit function of the estimator. + + + Returns + ------- + score : float + Score of this parameter setting on given training / test split. + + parameters : dict + The parameters that have been evaluated. + + n_samples_test : int + Number of test samples in this split. + """ + score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train, + test, verbose, parameters, + fit_params) + return score, parameters, n_samples_test + + +class ParameterSampler(object): + """Generator on parameters sampled from given distributions. + + Non-deterministic iterable over random candidate combinations for hyper- + parameter search. 
+ + Note that as of SciPy 0.12, the ``scipy.stats.distributions`` do not accept + a custom RNG instance and always use the singleton RNG from + ``numpy.random``. Hence setting ``random_state`` will not guarantee a + deterministic iteration whenever ``scipy.stats`` distributions are used to + define the parameter search space. + + Parameters + ---------- + param_distributions : dict + Dictionary where the keys are parameters and values + are distributions from which a parameter is to be sampled. + Distributions either have to provide a ``rvs`` function + to sample from them, or can be given as a list of values, + where a uniform distribution is assumed. + + n_iter : integer + Number of parameter settings that are produced. + + random_state : int or RandomState + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + + Returns + ------- + params : dict of string to any + **Yields** dictionaries mapping each estimator parameter to + as sampled value. + + Examples + -------- + >>> from sklearn.grid_search import ParameterSampler + >>> from scipy.stats.distributions import expon + >>> import numpy as np + >>> np.random.seed(0) + >>> param_grid = {'a':[1, 2], 'b': expon()} + >>> param_list = list(ParameterSampler(param_grid, n_iter=4)) + >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items()) + ... for d in param_list] + >>> rounded_list == [{'b': 0.89856, 'a': 1}, + ... {'b': 0.923223, 'a': 1}, + ... {'b': 1.878964, 'a': 2}, + ... {'b': 1.038159, 'a': 2}] + True + """ + def __init__(self, param_distributions, n_iter, random_state=None): + self.param_distributions = param_distributions + self.n_iter = n_iter + self.random_state = random_state + + def __iter__(self): + rnd = check_random_state(self.random_state) + # Always sort the keys of a dictionary, for reproducibility + items = sorted(self.param_distributions.items()) + for _ in range(self.n_iter): + params = dict() + for k, v in items: + if hasattr(v, "rvs"): + params[k] = v.rvs() + else: + params[k] = v[rnd.randint(len(v))] + yield params + + def __len__(self): + """Number of points that will be sampled.""" + return self.n_iter diff --git a/sklearn/model_selection/validate.py b/sklearn/model_selection/validate.py new file mode 100644 index 0000000000000..815064dfb6219 --- /dev/null +++ b/sklearn/model_selection/validate.py @@ -0,0 +1,652 @@ +""" +The :mod:`sklearn.model_selection.validate` module includes +""" +#TODO Complete docstring + +# Author: Alexandre Gramfort , +# Gael Varoquaux , +# Olivier Grisel +# Alexander Fabisch +# License: BSD 3 clause + +from __future__ import print_function +from __future__ import division + +import numbers +import time +import warnings + +import numpy as np + +from sklearn.base import is_classifier, clone +from sklearn.utils import check_arrays, check_random_state, safe_mask +from sklearn.utils.validation import _num_samples +from sklearn.externals.joblib import Parallel, delayed, logger +from sklearn.metrics.scorer import check_scoring +from sklearn.utils import check_arrays +from sklearn.utils.fixes import astype +from .partition import _check_cv + +def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, + verbose=0, fit_params=None, score_func=None, + pre_dispatch='2*n_jobs'): + """Evaluate a score by cross-validation + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : array-like + The data to fit. 
Can be, for example a list, or an array at least 2d. + + y : array-like, optional, default: None + The target variable to try to predict in the case of + supervised learning. + + scoring : string, callable or None, optional, default: None + A string (see model evaluation documentation) or + a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + cv : cross-validation generator, optional, default: None + A cross-validation generator. If None, a 3-fold cross + validation is used or 3-fold stratified cross-validation + when y is supplied and estimator is a classifier. + + n_jobs : integer, optional + The number of CPUs to use to do the computation. -1 means + 'all CPUs'. + + verbose : integer, optional + The verbosity level. + + fit_params : dict, optional + Parameters to pass to the fit method of the estimator. + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + Returns + ------- + scores : array of float, shape=(len(list(cv)),) + Array of scores of the estimator for each run of the cross validation. + """ + X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True, + allow_nans=True, allow_nd=True) + if y is not None: + y = np.asarray(y) + + cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, score_func=score_func, scoring=scoring) + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. + parallel = Parallel(n_jobs=n_jobs, verbose=verbose, + pre_dispatch=pre_dispatch) + scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, + train, test, verbose, None, + fit_params) + for train, test in cv) + return np.array(scores)[:, 0] + + +def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, + fit_params, return_train_score=False, + return_parameters=False): + """Fit estimator and compute scores for a given dataset split. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : array-like of shape at least 2D + The data to fit. + + y : array-like, optional, default: None + The target variable to try to predict in the case of + supervised learning. + + scoring : callable + A scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + train : array-like, shape = (n_train_samples,) + Indices of training samples. + + test : array-like, shape = (n_test_samples,) + Indices of test samples. + + verbose : integer + The verbosity level. + + parameters : dict or None + Parameters to be set on the estimator. + + fit_params : dict or None + Parameters that will be passed to ``estimator.fit``. + + return_train_score : boolean, optional, default: False + Compute and return score on training set. + + return_parameters : boolean, optional, default: False + Return parameters that has been used for the estimator. 
+ + Returns + ------- + train_score : float, optional + Score on training set, returned only if `return_train_score` is `True`. + + test_score : float + Score on test set. + + n_test_samples : int + Number of test samples. + + scoring_time : float + Time spent for fitting and scoring in seconds. + + parameters : dict or None, optional + The parameters that have been evaluated. + """ + if verbose > 1: + if parameters is None: + msg = "no parameters to be set" + else: + msg = '%s' % (', '.join('%s=%s' % (k, v) + for k, v in parameters.items())) + print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) + + # Adjust lenght of sample weights + n_samples = _num_samples(X) + fit_params = fit_params if fit_params is not None else {} + fit_params = dict([(k, np.asarray(v)[train] + if hasattr(v, '__len__') and len(v) == n_samples else v) + for k, v in fit_params.items()]) + + if parameters is not None: + estimator.set_params(**parameters) + + start_time = time.time() + + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + test_score = _score(estimator, X_test, y_test, scorer) + if return_train_score: + train_score = _score(estimator, X_train, y_train, scorer) + + scoring_time = time.time() - start_time + + if verbose > 2: + msg += ", score=%f" % test_score + if verbose > 1: + end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) + print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) + + ret = [train_score] if return_train_score else [] + ret.extend([test_score, _num_samples(X_test), scoring_time]) + if return_parameters: + ret.append(parameters) + return ret + + +def _safe_split(estimator, X, y, indices, train_indices=None): + """Create subset of dataset and properly handle kernels.""" + if hasattr(estimator, 'kernel') and callable(estimator.kernel): + # cannot compute the kernel values with custom function + raise ValueError("Cannot use a custom kernel function. " + "Precompute the kernel matrix instead.") + + if not hasattr(X, "shape"): + if getattr(estimator, "_pairwise", False): + raise ValueError("Precomputed kernels or affinity matrices have " + "to be passed as arrays or sparse matrices.") + X_subset = [X[idx] for idx in indices] + else: + if getattr(estimator, "_pairwise", False): + # X is a precomputed square kernel matrix + if X.shape[0] != X.shape[1]: + raise ValueError("X should be a square kernel matrix") + if train_indices is None: + X_subset = X[np.ix_(indices, indices)] + else: + X_subset = X[np.ix_(indices, train_indices)] + else: + X_subset = X[safe_mask(X, indices)] + + if y is not None: + y_subset = y[safe_mask(y, indices)] + else: + y_subset = None + + return X_subset, y_subset + + +def _score(estimator, X_test, y_test, scorer): + """Compute the score of an estimator on a given test set.""" + if y_test is None: + score = scorer(estimator, X_test) + else: + score = scorer(estimator, X_test, y_test) + if not isinstance(score, numbers.Number): + raise ValueError("scoring must return a number, got %s (%s) instead." 
+                         % (str(score), type(score)))
+    return score
+
+
+def _permutation_test_score(estimator, X, y, cv, scorer):
+    """Auxiliary function for permutation_test_score"""
+    avg_score = []
+    for train, test in cv:
+        estimator.fit(X[train], y[train])
+        avg_score.append(scorer(estimator, X[test], y[test]))
+    return np.mean(avg_score)
+
+
+def _shuffle(y, labels, random_state):
+    """Return a shuffled copy of y, shuffled within labels if provided."""
+    if labels is None:
+        ind = random_state.permutation(len(y))
+    else:
+        ind = np.arange(len(labels))
+        for label in np.unique(labels):
+            this_mask = (labels == label)
+            ind[this_mask] = random_state.permutation(ind[this_mask])
+    return y[ind]
+
+
+def permutation_test_score(estimator, X, y, score_func=None, cv=None,
+                           n_permutations=100, n_jobs=1, labels=None,
+                           random_state=0, verbose=0, scoring=None):
+    """Evaluate the significance of a cross-validated score with permutations
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : array-like of shape at least 2D
+        The data to fit.
+
+    y : array-like
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    scoring : string, callable or None, optional, default: None
+        A string (see model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+
+    cv : integer or cross-validation generator, optional
+        If an integer is passed, it is the number of folds (default 3).
+        Specific cross-validation objects can be passed, see
+        sklearn.cross_validation module for the list of possible objects.
+
+    n_permutations : integer, optional
+        Number of times to permute ``y``.
+
+    n_jobs : integer, optional
+        The number of CPUs to use to do the computation. -1 means
+        'all CPUs'.
+
+    labels : array-like of shape [n_samples] (optional)
+        Labels constrain the permutation among groups of samples with
+        the same label.
+
+    random_state : RandomState or an int seed (0 by default)
+        A random number generator instance to define the state of the
+        random permutations generator.
+
+    verbose : integer, optional
+        The verbosity level.
+
+    Returns
+    -------
+    score : float
+        The true score without permuting targets.
+
+    permutation_scores : array, shape = [n_permutations]
+        The scores obtained for each permutation.
+
+    pvalue : float
+        The returned value equals p-value if `score_func` returns bigger
+        numbers for better scores (e.g., accuracy_score). If `score_func` is
+        rather a loss function (i.e. when lower is better such as with
+        `mean_squared_error`) then this is actually the complement of the
+        p-value: 1 - p-value.
+
+    Notes
+    -----
+    This function implements Test 1 in:
+
+        Ojala and Garriga. Permutation Tests for Studying Classifier
+        Performance. The Journal of Machine Learning Research (2010)
+        vol. 11
+
+    """
+    X, y = check_arrays(X, y, sparse_format='csr', allow_nans=True)
+    cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
+    scorer = check_scoring(estimator, scoring=scoring, score_func=score_func)
+    random_state = check_random_state(random_state)
+
+    # We clone the estimator to make sure that all the folds are
+    # independent, and that it is pickle-able.
+    score = _permutation_test_score(clone(estimator), X, y, cv, scorer)
+    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
+        delayed(_permutation_test_score)(
+            clone(estimator), X, _shuffle(y, labels, random_state), cv,
+            scorer)
+        for _ in range(n_permutations))
+    permutation_scores = np.array(permutation_scores)
+    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
+    return score, permutation_scores, pvalue
+
+
+permutation_test_score.__test__ = False  # to avoid a problem with nosetests
+
+
+def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5),
+                   cv=None, scoring=None, exploit_incremental_learning=False,
+                   n_jobs=1, pre_dispatch="all", verbose=0):
+    """Learning curve.
+
+    Determines cross-validated training and test scores for different training
+    set sizes.
+
+    A cross-validation generator splits the whole dataset k times into
+    training and test data. Subsets of the training set with varying sizes
+    will be used to train the estimator and a score for each training subset
+    size and the test set will be computed. Afterwards, the scores will be
+    averaged over all k runs for each training subset size.
+
+    Parameters
+    ----------
+    estimator : object type that implements the "fit" and "predict" methods
+        An object of that type which is cloned for each validation.
+
+    X : array-like, shape (n_samples, n_features)
+        Training vector, where n_samples is the number of samples and
+        n_features is the number of features.
+
+    y : array-like, shape (n_samples) or (n_samples, n_features), optional
+        Target relative to X for classification or regression;
+        None for unsupervised learning.
+
+    train_sizes : array-like, shape (n_ticks,), dtype float or int
+        Relative or absolute numbers of training examples that will be used to
+        generate the learning curve. If the dtype is float, it is regarded as a
+        fraction of the maximum size of the training set (that is determined
+        by the selected validation method), i.e. it has to be within (0, 1].
+        Otherwise it is interpreted as absolute sizes of the training sets.
+        Note that for classification the number of samples usually has to
+        be big enough to contain at least one sample from each class.
+        (default: np.linspace(0.1, 1.0, 5))
+
+    cv : integer, cross-validation generator, optional
+        If an integer is passed, it is the number of folds (defaults to 3).
+        Specific cross-validation objects can be passed, see
+        sklearn.cross_validation module for the list of possible objects
+
+    scoring : string, callable or None, optional, default: None
+        A string (see model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+
+    exploit_incremental_learning : boolean, optional, default: False
+        If the estimator supports incremental learning, this will be
+        used to speed up fitting for different training set sizes.
+
+    n_jobs : integer, optional
+        Number of jobs to run in parallel (default 1).
+
+    pre_dispatch : integer or string, optional
+        Number of predispatched jobs for parallel execution (default is
+        all). The option can reduce the allocated memory. The string can
+        be an expression like '2*n_jobs'.
+
+    verbose : integer, optional
+        Controls the verbosity: the higher, the more messages.
+
+    Returns
+    -------
+    train_sizes_abs : array, shape = (n_unique_ticks,), dtype int
+        Numbers of training examples that have been used to generate the
+        learning curve. Note that the number of ticks might be less
+        than n_ticks because duplicate entries will be removed.
+ + train_scores : array, shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : array, shape (n_ticks, n_cv_folds) + Scores on test set. + + Notes + ----- + See :ref:`examples/plot_learning_curve.py ` + """ + if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): + raise ValueError("An estimator must support the partial_fit interface " + "to exploit incremental learning") + + X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) + # Make a list since we will be iterating multiple times over the folds + cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) + scorer = check_scoring(estimator, scoring=scoring) + + # HACK as long as boolean indices are allowed in cv generators + if cv[0][0].dtype == bool: + new_cv = [] + for i in range(len(cv)): + new_cv.append((np.nonzero(cv[i][0])[0], np.nonzero(cv[i][1])[0])) + cv = new_cv + + n_max_training_samples = len(cv[0][0]) + # Because the lengths of folds can be significantly different, it is + # not guaranteed that we use all of the available training data when we + # use the first 'n_max_training_samples' samples. + train_sizes_abs = _translate_train_sizes(train_sizes, + n_max_training_samples) + n_unique_ticks = train_sizes_abs.shape[0] + if verbose > 0: + print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) + + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, + verbose=verbose) + if exploit_incremental_learning: + classes = np.unique(y) if is_classifier(estimator) else None + out = parallel(delayed(_incremental_fit_estimator)( + clone(estimator), X, y, classes, train, test, train_sizes_abs, + scorer, verbose) for train, test in cv) + else: + out = parallel(delayed(_fit_and_score)( + clone(estimator), X, y, scorer, train[:n_train_samples], test, + verbose, parameters=None, fit_params=None, return_train_score=True) + for train, test in cv for n_train_samples in train_sizes_abs) + out = np.array(out)[:, :2] + n_cv_folds = out.shape[0] // n_unique_ticks + out = out.reshape(n_cv_folds, n_unique_ticks, 2) + + out = np.asarray(out).transpose((2, 1, 0)) + + return train_sizes_abs, out[0], out[1] + + +def _translate_train_sizes(train_sizes, n_max_training_samples): + """Determine absolute sizes of training subsets and validate 'train_sizes'. + + Examples: + _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] + _translate_train_sizes([5, 10], 10) -> [5, 10] + + Parameters + ---------- + train_sizes : array-like, shape (n_ticks,), dtype float or int + Numbers of training examples that will be used to generate the + learning curve. If the dtype is float, it is regarded as a + fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. + + n_max_training_samples : int + Maximum number of training samples (upper bound of 'train_sizes'). + + Returns + ------- + train_sizes_abs : array, shape (n_unique_ticks,), dtype int + Numbers of training examples that will be used to generate the + learning curve. Note that the number of ticks might be less + than n_ticks because duplicate entries will be removed. 
+ """ + train_sizes_abs = np.asarray(train_sizes) + n_ticks = train_sizes_abs.shape[0] + n_min_required_samples = np.min(train_sizes_abs) + n_max_required_samples = np.max(train_sizes_abs) + if np.issubdtype(train_sizes_abs.dtype, np.float): + if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: + raise ValueError("train_sizes has been interpreted as fractions " + "of the maximum number of training samples and " + "must be within (0, 1], but is within [%f, %f]." + % (n_min_required_samples, + n_max_required_samples)) + train_sizes_abs = astype(train_sizes_abs * n_max_training_samples, + dtype=np.int, copy=False) + train_sizes_abs = np.clip(train_sizes_abs, 1, + n_max_training_samples) + else: + if (n_min_required_samples <= 0 or + n_max_required_samples > n_max_training_samples): + raise ValueError("train_sizes has been interpreted as absolute " + "numbers of training samples and must be within " + "(0, %d], but is within [%d, %d]." + % (n_max_training_samples, + n_min_required_samples, + n_max_required_samples)) + + train_sizes_abs = np.unique(train_sizes_abs) + if n_ticks > train_sizes_abs.shape[0]: + warnings.warn("Removed duplicate entries from 'train_sizes'. Number " + "of ticks will be less than than the size of " + "'train_sizes' %d instead of %d)." + % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning) + + return train_sizes_abs + + +def _incremental_fit_estimator(estimator, X, y, classes, train, test, + train_sizes, scorer, verbose): + """Train estimator on training subsets incrementally and compute scores.""" + train_scores, test_scores = [], [] + partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) + for n_train_samples, partial_train in partitions: + train_subset = train[:n_train_samples] + X_train, y_train = _safe_split(estimator, X, y, train_subset) + X_partial_train, y_partial_train = _safe_split(estimator, X, y, + partial_train) + X_test, y_test = _safe_split(estimator, X, y, test, train_subset) + if y_partial_train is None: + estimator.partial_fit(X_partial_train, classes=classes) + else: + estimator.partial_fit(X_partial_train, y_partial_train, + classes=classes) + train_scores.append(_score(estimator, X_train, y_train, scorer)) + test_scores.append(_score(estimator, X_test, y_test, scorer)) + return np.array((train_scores, test_scores)).T + + +def validation_curve(estimator, X, y, param_name, param_range, cv=None, + scoring=None, n_jobs=1, pre_dispatch="all", verbose=0): + """Validation curve. + + Determine training and test scores for varying parameter values. + + Compute scores for an estimator with different values of a specified + parameter. This is similar to grid search with one parameter. However, this + will also compute training scores and is merely a utility for plotting the + results. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like, shape (n_samples, n_features) + Training vector, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape (n_samples) or (n_samples, n_features), optional + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : string + Name of the parameter that will be varied. + + param_range : array-like, shape (n_values,) + The values of the parameter that will be evaluated. 
+ + cv : integer, cross-validation generator, optional + If an integer is passed, it is the number of folds (defaults to 3). + Specific cross-validation objects can be passed, see + sklearn.cross_validation module for the list of possible objects + + scoring : string, callable or None, optional, default: None + A string (see model evaluation documentation) or + a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + n_jobs : integer, optional + Number of jobs to run in parallel (default 1). + + pre_dispatch : integer or string, optional + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The string can + be an expression like '2*n_jobs'. + + verbose : integer, optional + Controls the verbosity: the higher, the more messages. + + Returns + ------- + train_scores : array, shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : array, shape (n_ticks, n_cv_folds) + Scores on test set. + + Notes + ----- + See + :ref:`examples/plot_validation_curve.py ` + """ + X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) + cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, scoring=scoring) + + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, + verbose=verbose) + out = parallel(delayed(_fit_and_score)( + estimator, X, y, scorer, train, test, verbose, + parameters={param_name: v}, fit_params=None, return_train_score=True) + for train, test in cv for v in param_range) + + out = np.asarray(out)[:, :2] + n_params = len(param_range) + n_cv_folds = out.shape[0] // n_params + out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0)) + + return out[0], out[1] diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 8a27d9c5a7872..73bdae2641442 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -19,6 +19,7 @@ from sklearn.utils.testing import ignore_warnings from sklearn import cross_validation as cval +from sklearn.model_selection.validate import _safe_split from sklearn.base import BaseEstimator from sklearn.datasets import make_regression from sklearn.datasets import load_digits @@ -885,12 +886,12 @@ def test_safe_split_with_precomputed_kernel(): cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0) tr, te = list(cv)[0] - X_tr, y_tr = cval._safe_split(clf, X, y, tr) - K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr) + X_tr, y_tr = _safe_split(clf, X, y, tr) + K_tr, y_tr2 = _safe_split(clfp, K, y, tr) assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T)) - X_te, y_te = cval._safe_split(clf, X, y, te, tr) - K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr) + X_te, y_te = _safe_split(clf, X, y, te, tr) + K_te, y_te2 = _safe_split(clfp, K, y, te, tr) assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
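
To exercise the refactored layout end to end, the following is a minimal smoke-test sketch, not part of the patch itself. It assumes the sklearn.model_selection.search, sklearn.model_selection.utils and sklearn.model_selection.validate modules are importable exactly as laid out in the diff above; the iris dataset and SVC estimator are purely illustrative choices.

# Minimal, hypothetical smoke test for the new model_selection modules.
# Import paths are taken from the patch above; data/estimator are illustrative.
from scipy.stats import expon

from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection.search import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection.utils import ParameterGrid
from sklearn.model_selection.validate import cross_val_score

iris = load_iris()
X, y = iris.data, iris.target

# ParameterGrid expands a dict of lists into every parameter combination.
param_grid = {'kernel': ['linear', 'rbf'], 'C': [1, 10]}
print(len(list(ParameterGrid(param_grid))))  # 4 candidate settings

# Exhaustive search: every candidate is scored with the default 3-fold CV.
grid_search = GridSearchCV(SVC(), param_grid).fit(X, y)
print(grid_search.best_params_, grid_search.best_score_)

# Randomized search: a fixed number of settings sampled from a distribution.
rand_search = RandomizedSearchCV(SVC(), {'C': expon(scale=10)}, n_iter=5,
                                 random_state=0).fit(X, y)
print(rand_search.best_params_)

# cross_val_score from the new validate module: one score per CV fold.
print(cross_val_score(SVC(), X, y, cv=5))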