diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 38110090dffdc..48fbb10f7f924 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -794,7 +794,7 @@ adheres to the scikit-learn interface and standards by running The main motivation to make a class compatible to the scikit-learn estimator interface might be that you want to use it together with model assessment and -selection tools such as :class:`grid_search.GridSearchCV`. +selection tools such as :class:`model_selection.GridSearchCV`. For this to work, you need to implement the following interface. If a dependency on scikit-learn is okay for your code, @@ -856,7 +856,7 @@ implement the interface is:: Parameters and init ------------------- -As :class:`grid_search.GridSearchCV` uses ``set_params`` +As :class:`model_selection.GridSearchCV` uses ``set_params`` to apply parameter setting to estimators, it is essential that calling ``set_params`` has the same effect as setting parameters using the ``__init__`` method. @@ -874,9 +874,8 @@ trailing ``_`` is used to check if the estimator has been fitted. Cloning ------- -For using :class:`grid_search.GridSearch` or any functionality of the -:mod:`cross_validation` module, an estimator must support the ``base.clone`` -function to replicate an estimator. +For use with the :mod:`model_selection` module, +an estimator must support the ``base.clone`` function to replicate an estimator. This can be done by providing a ``get_params`` method. If ``get_params`` is present, then ``clone(estimator)`` will be an instance of ``type(estimator)`` on which ``set_params`` has been called with clones of @@ -901,8 +900,8 @@ accepts an optional ``y``. Estimator types --------------- Some common functionality depends on the kind of estimator passed. -For example, cross-validation in :class:`grid_search.GridSearchCV` and -:func:`cross_validation.cross_val_score` defaults to being stratified when used +For example, cross-validation in :class:`model_selection.GridSearchCV` and +:func:`model_selection.cross_val_score` defaults to being stratified when used on a classifier, but not otherwise. Similarly, scorers for average precision that take a continuous prediction need to call ``decision_function`` for classifiers, but ``predict`` for regressors. This distinction between classifiers and regressors diff --git a/doc/model_selection.rst b/doc/model_selection.rst index daec6a6ed83e4..43036e9bdf6c7 100644 --- a/doc/model_selection.rst +++ b/doc/model_selection.rst @@ -11,4 +11,3 @@ Model selection and evaluation modules/grid_search modules/model_evaluation modules/model_persistence - modules/learning_curve diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3ba4c8952568e..64f99c06ebd8f 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -144,16 +144,50 @@ Classes covariance.graph_lasso -.. _cross_validation_ref: - -:mod:`sklearn.cross_validation`: Cross Validation -================================================= +:mod:`sklearn.model_selection`: Model Selection +=============================================== -.. automodule:: sklearn.cross_validation +.. automodule:: sklearn.model_selection :no-members: :no-inherited-members: -**User guide:** See the :ref:`cross_validation` section for further details. +**User guide:** See the :ref:`cross_validation`, :ref:`grid_search` and +:ref:`learning_curve` sections for further details. + +Splitter Classes +---------------- + +.. currentmodule:: sklearn + +.. 
autosummary:: + :toctree: generated/ + :template: class.rst + + model_selection.KFold + model_selection.LabelKFold + model_selection.StratifiedKFold + model_selection.LeaveOneLabelOut + model_selection.LeavePLabelOut + model_selection.LeaveOneOut + model_selection.LeavePOut + model_selection.ShuffleSplit + model_selection.LabelShuffleSplit + model_selection.StratifiedShuffleSplit + model_selection.PredefinedSplit + +Splitter Functions +------------------ + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + model_selection.train_test_split + +Hyper-parameter optimizers +-------------------------- .. currentmodule:: sklearn @@ -161,28 +195,25 @@ Classes :toctree: generated/ :template: class.rst - cross_validation.KFold - cross_validation.LabelKFold - cross_validation.LabelShuffleSplit - cross_validation.LeaveOneLabelOut - cross_validation.LeaveOneOut - cross_validation.LeavePLabelOut - cross_validation.LeavePOut - cross_validation.PredefinedSplit - cross_validation.ShuffleSplit - cross_validation.StratifiedKFold - cross_validation.StratifiedShuffleSplit + model_selection.GridSearchCV + model_selection.RandomizedSearchCV + model_selection.ParameterGrid + model_selection.ParameterSampler + +Model validation +---------------- +.. currentmodule:: sklearn .. autosummary:: :toctree: generated/ :template: function.rst - cross_validation.train_test_split - cross_validation.cross_val_score - cross_validation.cross_val_predict - cross_validation.permutation_test_score - cross_validation.check_cv + model_selection.cross_val_score + model_selection.cross_val_predict + model_selection.permutation_test_score + model_selection.learning_curve + model_selection.validation_curve .. _datasets_ref: @@ -547,29 +578,6 @@ Kernels: gaussian_process.kernels.CompoundKernel gaussian_process.kernels.Hyperparameter -.. _grid_search_ref: - -:mod:`sklearn.grid_search`: Grid Search -======================================= - -.. automodule:: sklearn.grid_search - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`grid_search` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - grid_search.GridSearchCV - grid_search.ParameterGrid - grid_search.ParameterSampler - grid_search.RandomizedSearchCV - - .. _isotonic_ref: :mod:`sklearn.isotonic`: Isotonic regression @@ -658,24 +666,6 @@ Kernels: discriminant_analysis.QuadraticDiscriminantAnalysis -.. _learning_curve_ref: - -:mod:`sklearn.learning_curve` Learning curve evaluation -======================================================= - -.. automodule:: sklearn.learning_curve - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - learning_curve.learning_curve - learning_curve.validation_curve - .. _linear_model_ref: :mod:`sklearn.linear_model`: Generalized Linear Models diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index e76b95c6e48be..52f98e341971b 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -4,7 +4,7 @@ Cross-validation: evaluating estimator performance =================================================== -.. currentmodule:: sklearn.cross_validation +.. 
currentmodule:: sklearn.model_selection Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat @@ -24,7 +24,7 @@ can be quickly computed with the :func:`train_test_split` helper function. Let's load the iris data set to fit a linear support vector machine on it:: >>> import numpy as np - >>> from sklearn import cross_validation + >>> from sklearn.model_selection import train_test_split >>> from sklearn import datasets >>> from sklearn import svm @@ -35,7 +35,7 @@ Let's load the iris data set to fit a linear support vector machine on it:: We can now quickly sample a training set while holding out 40% of the data for testing (evaluating) our classifier:: - >>> X_train, X_test, y_train, y_test = cross_validation.train_test_split( + >>> X_train, X_test, y_train, y_test = train_test_split( ... iris.data, iris.target, test_size=0.4, random_state=0) >>> X_train.shape, y_train.shape @@ -101,10 +101,9 @@ kernel support vector machine on the iris dataset by splitting the data, fitting a model and computing the score 5 consecutive times (with different splits each time):: + >>> from sklearn.model_selection import cross_val_score >>> clf = svm.SVC(kernel='linear', C=1) - >>> scores = cross_validation.cross_val_score( - ... clf, iris.data, iris.target, cv=5) - ... + >>> scores = cross_val_score(clf, iris.data, iris.target, cv=5) >>> scores # doctest: +ELLIPSIS array([ 0.96..., 1. ..., 0.96..., 0.96..., 1. ]) @@ -119,8 +118,8 @@ method of the estimator. It is possible to change this by using the scoring parameter:: >>> from sklearn import metrics - >>> scores = cross_validation.cross_val_score(clf, iris.data, iris.target, - ... cv=5, scoring='f1_weighted') + >>> scores = cross_val_score( + ... clf, iris.data, iris.target, cv=5, scoring='f1_macro') >>> scores # doctest: +ELLIPSIS array([ 0.96..., 1. ..., 0.96..., 0.96..., 1. ]) @@ -136,11 +135,10 @@ being used if the estimator derives from :class:`ClassifierMixin It is also possible to use other cross validation strategies by passing a cross validation iterator instead, for instance:: + >>> from sklearn.model_selection import ShuffleSplit >>> n_samples = iris.data.shape[0] - >>> cv = cross_validation.ShuffleSplit(n_samples, n_iter=3, - ... test_size=0.3, random_state=0) - - >>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=cv) + >>> cv = ShuffleSplit(n_iter=3, test_size=0.3, random_state=0) + >>> cross_val_score(clf, iris.data, iris.target, cv=cv) ... # doctest: +ELLIPSIS array([ 0.97..., 0.97..., 1. ]) @@ -153,7 +151,7 @@ validation iterator instead, for instance:: be learnt from a training set and applied to held-out data for prediction:: >>> from sklearn import preprocessing - >>> X_train, X_test, y_train, y_test = cross_validation.train_test_split( + >>> X_train, X_test, y_train, y_test = train_test_split( ... iris.data, iris.target, test_size=0.4, random_state=0) >>> scaler = preprocessing.StandardScaler().fit(X_train) >>> X_train_transformed = scaler.transform(X_train) @@ -167,7 +165,7 @@ validation iterator instead, for instance:: >>> from sklearn.pipeline import make_pipeline >>> clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1)) - >>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=cv) + >>> cross_val_score(clf, iris.data, iris.target, cv=cv) ... # doctest: +ELLIPSIS array([ 0.97..., 0.93..., 0.95...]) @@ -184,8 +182,8 @@ can be used (otherwise, an exception is raised). 
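A side effect of the rewritten examples above is that a splitter is no longer bound to a dataset at construction time; the data is only consulted when ``split`` is called. A minimal sketch of that reuse, assuming the ``n_iter``-style constructor names used throughout this patch::

    import numpy as np
    from sklearn.model_selection import ShuffleSplit

    # One splitter instance can be reused on datasets of different sizes,
    # because the train/test indices are generated lazily by split().
    cv = ShuffleSplit(n_iter=2, test_size=0.5, random_state=0)
    for train, test in cv.split(np.arange(8).reshape(4, 2)):
        print("%s %s" % (train, test))
    for train, test in cv.split(np.arange(12).reshape(6, 2)):
        print("%s %s" % (train, test))
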
These prediction can then be used to evaluate the classifier:: - >>> predicted = cross_validation.cross_val_predict(clf, iris.data, - ... iris.target, cv=10) + >>> from sklearn.model_selection import cross_val_predict + >>> predicted = cross_val_predict(clf, iris.data, iris.target, cv=10) >>> metrics.accuracy_score(iris.target, predicted) # doctest: +ELLIPSIS 0.966... @@ -223,10 +221,11 @@ learned using :math:`k - 1` folds, and the fold left out is used for test. Example of 2-fold cross-validation on a dataset with 4 samples:: >>> import numpy as np - >>> from sklearn.cross_validation import KFold + >>> from sklearn.model_selection import KFold - >>> kf = KFold(4, n_folds=2) - >>> for train, test in kf: + >>> X = ["a", "b", "c", "d"] + >>> kf = KFold(n_folds=2) + >>> for train, test in kf.split(X): ... print("%s %s" % (train, test)) [2 3] [0 1] [0 1] [2 3] @@ -250,11 +249,12 @@ target class as the complete set. Example of stratified 3-fold cross-validation on a dataset with 10 samples from two slightly unbalanced classes:: - >>> from sklearn.cross_validation import StratifiedKFold + >>> from sklearn.model_selection import StratifiedKFold - >>> labels = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1] - >>> skf = StratifiedKFold(labels, 3) - >>> for train, test in skf: + >>> X = np.ones(10) + >>> y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + >>> skf = StratifiedKFold(n_folds=3) + >>> for train, test in skf.split(X, y): ... print("%s %s" % (train, test)) [2 3 6 7 8 9] [0 1 4 5] [0 1 3 4 5 8 9] [2 6 7] @@ -272,12 +272,14 @@ subjects. Imagine you have three subjects, each with an associated number from 1 to 3:: - >>> from sklearn.cross_validation import LabelKFold + >>> from sklearn.model_selection import LabelKFold + >>> X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10] + >>> y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"] >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] - >>> lkf = LabelKFold(labels, n_folds=3) - >>> for train, test in lkf: + >>> lkf = LabelKFold(n_folds=3) + >>> for train, test in lkf.split(X, y, labels): ... print("%s %s" % (train, test)) [0 1 2 3 4 5] [6 7 8 9] [0 1 2 6 7 8 9] [3 4 5] @@ -298,10 +300,11 @@ training sets and :math:`n` different tests set. This cross-validation procedure does not waste much data as only one sample is removed from the training set:: - >>> from sklearn.cross_validation import LeaveOneOut + >>> from sklearn.model_selection import LeaveOneOut - >>> loo = LeaveOneOut(4) - >>> for train, test in loo: + >>> X = [1, 2, 3, 4] + >>> loo = LeaveOneOut() + >>> for train, test in loo.split(X): ... print("%s %s" % (train, test)) [1 2 3] [0] [0 2 3] [1] @@ -356,10 +359,11 @@ overlap for :math:`p > 1`. Example of Leave-2-Out on a dataset with 4 samples:: - >>> from sklearn.cross_validation import LeavePOut + >>> from sklearn.model_selection import LeavePOut - >>> lpo = LeavePOut(4, p=2) - >>> for train, test in lpo: + >>> X = np.ones(4) + >>> lpo = LeavePOut(p=2) + >>> for train, test in lpo.split(X): ... 
print("%s %s" % (train, test)) [2 3] [0 1] [1 3] [0 2] @@ -384,11 +388,13 @@ For example, in the cases of multiple experiments, *LOLO* can be used to create a cross-validation based on the different experiments: we create a training set using the samples of all the experiments except one:: - >>> from sklearn.cross_validation import LeaveOneLabelOut + >>> from sklearn.model_selection import LeaveOneLabelOut + >>> X = [1, 5, 10, 50] + >>> y = [0, 1, 1, 2] >>> labels = [1, 1, 2, 2] - >>> lolo = LeaveOneLabelOut(labels) - >>> for train, test in lolo: + >>> lolo = LeaveOneLabelOut() + >>> for train, test in lolo.split(X, y, labels): ... print("%s %s" % (train, test)) [2 3] [0 1] [0 1] [2 3] @@ -416,11 +422,13 @@ samples related to :math:`P` labels for each training/test set. Example of Leave-2-Label Out:: - >>> from sklearn.cross_validation import LeavePLabelOut + >>> from sklearn.model_selection import LeavePLabelOut + >>> X = np.arange(6) + >>> y = [1, 1, 1, 2, 2, 2] >>> labels = [1, 1, 2, 2, 3, 3] - >>> lplo = LeavePLabelOut(labels, p=2) - >>> for train, test in lplo: + >>> lplo = LeavePLabelOut(n_labels=2) + >>> for train, test in lplo.split(X, y, labels): ... print("%s %s" % (train, test)) [4 5] [0 1 2 3] [2 3] [0 1 4 5] @@ -443,9 +451,11 @@ generator. Here is a usage example:: - >>> ss = cross_validation.ShuffleSplit(5, n_iter=3, test_size=0.25, + >>> from sklearn.model_selection import ShuffleSplit + >>> X = np.arange(5) + >>> ss = ShuffleSplit(n_iter=3, test_size=0.25, ... random_state=0) - >>> for train_index, test_index in ss: + >>> for train_index, test_index in ss.split(X): ... print("%s %s" % (train_index, test_index)) ... [1 3 4] [2 0] @@ -469,12 +479,13 @@ out for each split. Here is a usage example:: - >>> from sklearn.cross_validation import LabelShuffleSplit + >>> from sklearn.model_selection import LabelShuffleSplit + >>> X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001] + >>> y = ["a", "b", "b", "b", "c", "c", "c", "a"] >>> labels = [1, 1, 2, 2, 3, 3, 4, 4] - >>> slo = LabelShuffleSplit(labels, n_iter=4, test_size=0.5, - ... random_state=0) - >>> for train, test in slo: + >>> lss = LabelShuffleSplit(n_iter=4, test_size=0.5, random_state=0) + >>> for train, test in lss.split(X, y, labels): ... print("%s %s" % (train, test)) ... [0 1 2 3] [4 5 6 7] @@ -541,4 +552,4 @@ Cross validation and model selection Cross validation iterators can also be used to directly perform model selection using Grid Search for the optimal hyperparameters of the -model. This is the topic if the next section: :ref:`grid_search`. +model. This is the topic of the next section: :ref:`grid_search`. diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index a033581012100..1e34706c4fec8 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -156,7 +156,7 @@ picked as the splitting rule. 
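Stepping back to the label-based iterators rewritten above: they all share the three-argument ``split(X, y, labels)`` protocol. A compact sketch of the grouping constraint, assuming the ``Label*`` class names used in this changeset::

    import numpy as np
    from sklearn.model_selection import LabelKFold

    X = np.arange(12).reshape(6, 2)
    y = [0, 0, 1, 1, 0, 1]
    labels = [1, 1, 2, 2, 3, 3]  # e.g. one label per subject
    # Samples sharing a label never appear on both sides of a split.
    for train, test in LabelKFold(n_folds=3).split(X, y, labels):
        print("train: %s test: %s" % (train, test))
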
This usually allows to reduce the variance of the model a bit more, at the expense of a slightly greater increase in bias:: - >>> from sklearn.cross_validation import cross_val_score + >>> from sklearn.model_selection import cross_val_score >>> from sklearn.datasets import make_blobs >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.ensemble import ExtraTreesClassifier @@ -360,7 +360,7 @@ Usage The following example shows how to fit an AdaBoost classifier with 100 weak learners:: - >>> from sklearn.cross_validation import cross_val_score + >>> from sklearn.model_selection import cross_val_score >>> from sklearn.datasets import load_iris >>> from sklearn.ensemble import AdaBoostClassifier @@ -948,7 +948,7 @@ Usage The following example shows how to fit the majority rule classifier:: >>> from sklearn import datasets - >>> from sklearn import cross_validation + >>> from sklearn.model_selection import cross_val_score >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.naive_bayes import GaussianNB >>> from sklearn.ensemble import RandomForestClassifier @@ -964,7 +964,7 @@ The following example shows how to fit the majority rule classifier:: >>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') >>> for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']): - ... scores = cross_validation.cross_val_score(clf, X, y, cv=5, scoring='accuracy') + ... scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy') ... print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) Accuracy: 0.90 (+/- 0.05) [Logistic Regression] Accuracy: 0.93 (+/- 0.05) [Random Forest] @@ -1041,7 +1041,7 @@ Using the `VotingClassifier` with `GridSearch` The `VotingClassifier` can also be used together with `GridSearch` in order to tune the hyperparameters of the individual estimators:: - >>> from sklearn.grid_search import GridSearchCV + >>> from sklearn.model_selection import GridSearchCV >>> clf1 = LogisticRegression(random_state=1) >>> clf2 = RandomForestClassifier(random_state=1) >>> clf3 = GaussianNB() diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index eb4a0b10828f3..de66f8bc02906 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -1,26 +1,27 @@ -.. currentmodule:: sklearn.grid_search + + +.. currentmodule:: sklearn.model_selection .. _grid_search: -=============================================== -Grid Search: Searching for estimator parameters -=============================================== +=========================================== +Tuning the hyper-parameters of an estimator +=========================================== + +Hyper-parameters are parameters that are not directly learnt within estimators. +In scikit-learn they are passed as arguments to the constructor of the +estimator classes. Typical examples include ``C``, ``kernel`` and ``gamma`` +for Support Vector Classifier, ``alpha`` for Lasso, etc. -Parameters that are not directly learnt within estimators can be set by -searching a parameter space for the best :ref:`cross_validation` score. -Typical examples include ``C``, ``kernel`` and ``gamma`` for Support Vector -Classifier, ``alpha`` for Lasso, etc. +It is possible and recommended to search the hyper-parameter space for the +best :ref:`cross_validation` score. Any parameter provided when constructing an estimator may be optimized in this -manner. 
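As a rough sketch of such a search with the relocated class (the grid values below are illustrative, not recommendations)::

    import numpy as np
    from sklearn import datasets, svm
    from sklearn.model_selection import GridSearchCV

    iris = datasets.load_iris()
    # Keys in the grid must match the estimator's constructor arguments,
    # which get_params() reports (see below).
    param_grid = {'C': [0.1, 1, 10], 'gamma': np.logspace(-3, 0, 4)}
    search = GridSearchCV(svm.SVC(), param_grid=param_grid)
    search.fit(iris.data, iris.target)
    print(search.best_params_)
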
Specifically, to find the names and current values for all parameters
+manner. Specifically, to find the names and current values for all parameters
 for a given estimator, use::

     estimator.get_params()

-Such parameters are often referred to as *hyperparameters* (particularly in
-Bayesian learning), distinguishing them from the parameters optimised in a
-machine learning procedure.
-
 A search consists of:

 - an estimator (regressor or classifier such as ``sklearn.svm.SVC()``);
@@ -38,6 +39,12 @@ given number of candidates from a parameter space with a specified
 distribution. After describing these tools we detail
 :ref:`best practice <grid_search_tips>` applicable to both approaches.

+Note that it is common that a small subset of those parameters can have a large
+impact on the predictive or computational performance of the model while others
+can be left to their default values. It is recommended to read the docstring of
+the estimator class to get a finer understanding of their expected behavior,
+possibly by reading the enclosed reference to the literature.
+
 Exhaustive Grid Search
 ======================

@@ -59,7 +66,7 @@ The :class:`GridSearchCV` instance implements the usual estimator API: when
 "fitting" it on a dataset all the possible combinations of parameter values are
 evaluated and the best combination is retained.

-.. currentmodule:: sklearn.grid_search
+.. currentmodule:: sklearn.model_selection

 .. topic:: Examples:

@@ -164,7 +171,7 @@ it is recommended to split the data into a **development set** (to be fed to
 the ``GridSearchCV`` instance) and an **evaluation set** to compute performance
 metrics.

-This can be done by using the :func:`cross_validation.train_test_split`
+This can be done by using the :func:`train_test_split`
 utility function.

 Parallelism
diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst
index 4cd655fdbb6c0..8708ef8c7acdf 100644
--- a/doc/modules/learning_curve.rst
+++ b/doc/modules/learning_curve.rst
@@ -4,7 +4,7 @@ Validation curves: plotting scores to evaluate models
 =====================================================

-.. currentmodule:: sklearn.learning_curve
+.. currentmodule:: sklearn.model_selection

 Every estimator has its advantages and drawbacks. Its generalization error
 can be decomposed in terms of bias, variance and noise. The **bias** of an
@@ -69,7 +69,7 @@ values. The function :func:`validation_curve` can help in this case::

     >>> import numpy as np
-    >>> from sklearn.learning_curve import validation_curve
+    >>> from sklearn.model_selection import validation_curve
     >>> from sklearn.datasets import load_iris
     >>> from sklearn.linear_model import Ridge
@@ -140,7 +140,7 @@ that are required to plot such a learning curve (number of samples that have
 been used, the average scores on the training sets and
 the average scores on the validation sets)::

-    >>> from sklearn.learning_curve import learning_curve
+    >>> from sklearn.model_selection import learning_curve
     >>> from sklearn.svm import SVC

     >>> train_sizes, train_scores, valid_scores = learning_curve(
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 1f2d282499646..d7361127d944c 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -15,8 +15,8 @@ model:
 * **Scoring parameter**: Model-evaluation tools using
   :ref:`cross-validation <cross_validation>` (such as
-  :func:`cross_validation.cross_val_score` and
-  :class:`grid_search.GridSearchCV`) rely on an internal *scoring* strategy.
+ :func:`model_selection.cross_val_score` and + :class:`model_selection.GridSearchCV`) rely on an internal *scoring* strategy. This is discussed in the section :ref:`scoring_parameter`. * **Metric functions**: The :mod:`metrics` module implements functions @@ -39,8 +39,8 @@ The ``scoring`` parameter: defining model evaluation rules ========================================================== Model selection and evaluation using tools, such as -:class:`grid_search.GridSearchCV` and -:func:`cross_validation.cross_val_score`, take a ``scoring`` parameter that +:class:`model_selection.GridSearchCV` and +:func:`model_selection.cross_val_score`, take a ``scoring`` parameter that controls what metric they apply to the estimators evaluated. Common cases: predefined values @@ -82,16 +82,17 @@ Scoring Function Comment Usage examples: - >>> from sklearn import svm, cross_validation, datasets + >>> from sklearn import svm, datasets + >>> from sklearn.model_selection import cross_val_score >>> iris = datasets.load_iris() >>> X, y = iris.data, iris.target + >>> clf = svm.SVC(probability=True, random_state=0) + >>> cross_val_score(clf, X, y, scoring='log_loss') # doctest: +ELLIPSIS + array([-0.07..., -0.16..., -0.06...]) >>> model = svm.SVC() - >>> cross_validation.cross_val_score(model, X, y, scoring='wrong_choice') + >>> cross_val_score(model, X, y, scoring='wrong_choice') Traceback (most recent call last): ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'] - >>> clf = svm.SVC(probability=True, random_state=0) - >>> cross_validation.cross_val_score(clf, X, y, scoring='log_loss') # doctest: +ELLIPSIS - array([-0.07..., -0.16..., -0.06...]) .. note:: @@ -135,7 +136,7 @@ the :func:`fbeta_score` function:: >>> from sklearn.metrics import fbeta_score, make_scorer >>> ftwo_scorer = make_scorer(fbeta_score, beta=2) - >>> from sklearn.grid_search import GridSearchCV + >>> from sklearn.model_selection import GridSearchCV >>> from sklearn.svm import LinearSVC >>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=ftwo_scorer) @@ -1423,7 +1424,7 @@ To illustrate :class:`DummyClassifier`, first let's create an imbalanced dataset:: >>> from sklearn.datasets import load_iris - >>> from sklearn.cross_validation import train_test_split + >>> from sklearn.model_selection import train_test_split >>> iris = load_iris() >>> X, y = iris.data, iris.target >>> y[y != 1] = -1 diff --git a/doc/modules/pipeline.rst b/doc/modules/pipeline.rst index 0f9c1f635995d..819a3162340a7 100644 --- a/doc/modules/pipeline.rst +++ b/doc/modules/pipeline.rst @@ -82,7 +82,7 @@ Parameters of the estimators in the pipeline can be accessed using the This is particularly important for doing grid searches:: - >>> from sklearn.grid_search import GridSearchCV + >>> from sklearn.model_selection import GridSearchCV >>> params = dict(reduce_dim__n_components=[2, 5, 10], ... svm__C=[0.1, 10, 100]) >>> grid_search = GridSearchCV(clf, param_grid=params) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index fda63a3bb66e0..fb3bcc46466bd 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -535,8 +535,8 @@ correctly. 
``gamma`` defines how much influence a single training example has.
 The larger ``gamma`` is, the closer other examples must be to be affected.

 Proper choice of ``C`` and ``gamma`` is critical to the SVM's performance. One
-is advised to use :class:`sklearn.grid_search.GridSearchCV` with ``C`` and ``gamma`` spaced
-exponentially far apart to choose good values.
+is advised to use :class:`sklearn.model_selection.GridSearchCV` with
+``C`` and ``gamma`` spaced exponentially far apart to choose good values.

 .. topic:: Examples:

diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst
index a77ca00459eb8..67215503cb38e 100644
--- a/doc/tutorial/statistical_inference/model_selection.rst
+++ b/doc/tutorial/statistical_inference/model_selection.rst
@@ -41,45 +41,64 @@ data in *folds* that we use for training and testing::
     >>> print(scores)
     [0.93489148580968284, 0.95659432387312182, 0.93989983305509184]

-.. currentmodule:: sklearn.cross_validation
+.. currentmodule:: sklearn.model_selection

-This is called a :class:`KFold` cross validation
+This is called a :class:`KFold` cross-validation.

 .. _cv_generators_tut:

 Cross-validation generators
 =============================

+Scikit-learn has a collection of classes which can be used to generate lists of
+train/test indices for popular cross-validation strategies.
+They expose a ``split`` method which accepts the input
+dataset to be split and yields the train/test set indices for each iteration
+of the chosen cross-validation strategy.
-The code above to split data in train and test sets is tedious to write.
-Scikit-learn exposes cross-validation generators to generate list
-of indices for this purpose::
+The following example shows the usage of the ``split`` method.

-    >>> from sklearn import cross_validation
-    >>> k_fold = cross_validation.KFold(n=6, n_folds=3)
-    >>> for train_indices, test_indices in k_fold:
+    >>> from sklearn.model_selection import KFold, cross_val_score
+    >>> X = ["a", "a", "b", "c", "c", "c"]
+    >>> k_fold = KFold(n_folds=3)
+    >>> for train_indices, test_indices in k_fold.split(X):
     ...      print('Train: %s | test: %s' % (train_indices, test_indices))
     Train: [2 3 4 5] | test: [0 1]
     Train: [0 1 4 5] | test: [2 3]
     Train: [0 1 2 3] | test: [4 5]

-The cross-validation can then be implemented easily::
+The cross-validation can then be performed easily::

-    >>> kfold = cross_validation.KFold(len(X_digits), n_folds=3)
+    >>> kfold = KFold(n_folds=3)
     >>> [svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test])
-    ...  for train, test in kfold]
+    ...  for train, test in kfold.split(X_digits)]
     [0.93489148580968284, 0.95659432387312182, 0.93989983305509184]

-To compute the ``score`` method of an estimator, the sklearn exposes
-a helper function::
+The cross-validation score can be directly calculated using the
+:func:`cross_val_score` helper. Given an estimator, the cross-validation object
+and the input dataset, the :func:`cross_val_score` splits the data repeatedly into
+a training and a testing set, trains the estimator using the training set and
+computes the scores based on the testing set for each iteration of cross-validation.

-    >>> cross_validation.cross_val_score(svc, X_digits, y_digits, cv=kfold, n_jobs=-1)
+By default the estimator's ``score`` method is used to compute the individual scores.
+
+Refer to the :ref:`metrics module <metrics>` to learn more on the available scoring
+methods.
+
+    >>> cross_val_score(svc, X_digits, y_digits, cv=k_fold, n_jobs=-1)
     array([ 0.93489149,  0.95659432,  0.93989983])

 `n_jobs=-1` means that the computation will be dispatched on all the CPUs
 of the computer.

+Alternatively, the ``scoring`` argument can be provided to specify an alternative
+scoring method.
+
+    >>> cross_val_score(svc, X_digits, y_digits, cv=k_fold,
+    ...                 scoring='precision_macro')
+    array([ 0.93969761,  0.95911415,  0.94041254])
+

 **Cross-validation generators**

@@ -87,23 +106,77 @@ of the computer.

   *

-    - :class:`KFold` **(n, k)**
+    - :class:`KFold` **(n_folds, shuffle, random_state)**
+
+    - :class:`StratifiedKFold` **(n_folds, shuffle, random_state)**
+
+    - :class:`LabelKFold` **(n_folds, shuffle, random_state)**
+
+
+  *
+
+    - Splits the dataset into K folds, trains on K-1 and then tests on the left-out.
+
+    - Same as K-Fold but preserves the class distribution within each fold.
+
+    - Ensures that the same label is not in both testing and training sets.
+
+
+.. list-table::
+
+  *
+
+    - :class:`ShuffleSplit` **(n_iter, test_size, train_size, random_state)**
+
+    - :class:`StratifiedShuffleSplit`
+
+    - :class:`LabelShuffleSplit`
+
+  *
+
+    - Generates train/test indices based on random permutation.
+
+    - Same as shuffle split but preserves the class distribution within each iteration.
+
+    - Ensures that the same label is not in both testing and training sets.
+
+
+.. list-table::
+
+  *
+
+    - :class:`LeaveOneLabelOut` **()**
+
+    - :class:`LeavePLabelOut` **(p)**
+
+    - :class:`LeaveOneOut` **()**
+
+
+
+  *
+
+    - Takes a label array to group observations.
+
+    - Leave P labels out.

-    - :class:`StratifiedKFold` **(y, k)**
+    - Leave one observation out.

-    - :class:`LeaveOneOut` **(n)**

-    - :class:`LeaveOneLabelOut` **(labels)**
+
+.. list-table::

   *

-      - Split it K folds, train on K-1 and then test on left-out
+    - :class:`LeavePOut` **(p)**
+
+    - :class:`PredefinedSplit`
+
+  *

-      - It preserves the class ratios / label distribution within each fold.
+    - Leave P observations out.

-      - Leave one observation out
+    - Generates train/test indices based on predefined splits.

-      - Takes a label array to group observations

 .. currentmodule:: sklearn.svm

@@ -132,14 +205,14 @@ Grid-search and cross-validated estimators
 Grid-search
 -------------

-.. currentmodule:: sklearn.grid_search
+.. currentmodule:: sklearn.model_selection

 The sklearn provides an object that, given data, computes the score during
 the fit of an estimator on a parameter grid and chooses the parameters to
 maximize the cross-validation score. This object takes an estimator during
 the construction and exposes an estimator API::

-    >>> from sklearn.grid_search import GridSearchCV
+    >>> from sklearn.model_selection import GridSearchCV, cross_val_score
     >>> Cs = np.logspace(-6, -1, 10)
     >>> clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs),
     ...                    n_jobs=-1)
@@ -163,8 +236,8 @@ a stratified 3-fold.

 ::

-    >>> cross_validation.cross_val_score(clf, X_digits, y_digits)
-    ... # doctest: +ELLIPSIS
+    >>> cross_val_score(clf, X_digits, y_digits)
+    ...
# doctest: +ELLIPSIS array([ 0.938..., 0.963..., 0.944...]) Two cross-validation loops are performed in parallel: one by the diff --git a/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py b/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py index fe2cde6e3f8fb..682fb45bd2aaa 100644 --- a/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py +++ b/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py @@ -15,7 +15,7 @@ from sklearn.linear_model import Perceptron from sklearn.pipeline import Pipeline from sklearn.datasets import load_files -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn import metrics diff --git a/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py b/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py index bd2b44a506226..11b1ff07acf7e 100644 --- a/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py +++ b/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py @@ -15,9 +15,9 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import LinearSVC from sklearn.pipeline import Pipeline -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from sklearn.datasets import load_files -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn import metrics diff --git a/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py b/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py index 2de33948bec58..c86b51eaeef9e 100644 --- a/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py +++ b/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py @@ -15,7 +15,7 @@ from sklearn.linear_model import Perceptron from sklearn.pipeline import Pipeline from sklearn.datasets import load_files -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn import metrics diff --git a/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py b/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py index c4dcba5da9f90..85c4989786934 100644 --- a/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py +++ b/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py @@ -15,9 +15,9 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import LinearSVC from sklearn.pipeline import Pipeline -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from sklearn.datasets import load_files -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn import metrics diff --git a/doc/tutorial/text_analytics/working_with_text_data.rst b/doc/tutorial/text_analytics/working_with_text_data.rst index f842185ddc75a..184b02a09dd11 100644 --- a/doc/tutorial/text_analytics/working_with_text_data.rst +++ b/doc/tutorial/text_analytics/working_with_text_data.rst @@ -420,7 +420,7 @@ parameters on a grid of possible values. 
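A hypothetical miniature of that kind of pipeline search under the new import path (the four documents and the ``vect``/``tfidf``/``clf`` step names are stand-ins mirroring the tutorial)::

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    docs = ['good film', 'bad film', 'great plot', 'poor plot']
    y = [1, 0, 1, 0]
    pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier())])
    # <step name>__<parameter name> addresses a parameter inside the pipeline.
    grid = GridSearchCV(pipe, {'vect__ngram_range': [(1, 1), (1, 2)],
                               'tfidf__use_idf': (True, False)}, cv=2)
    grid.fit(docs, y)
    print(grid.best_params_)
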
We try out all classifiers on either words or bigrams, with or without idf, and with a penalty parameter of either 0.01 or 0.001 for the linear SVM:: - >>> from sklearn.grid_search import GridSearchCV + >>> from sklearn.model_selection import GridSearchCV >>> parameters = {'vect__ngram_range': [(1, 1), (1, 2)], ... 'tfidf__use_idf': (True, False), ... 'clf__alpha': (1e-2, 1e-3), diff --git a/examples/applications/face_recognition.py b/examples/applications/face_recognition.py index be466e9532cdf..b79599ecb3a06 100644 --- a/examples/applications/face_recognition.py +++ b/examples/applications/face_recognition.py @@ -31,9 +31,9 @@ import logging import matplotlib.pyplot as plt -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split +from sklearn.model_selection import GridSearchCV from sklearn.datasets import fetch_lfw_people -from sklearn.grid_search import GridSearchCV from sklearn.metrics import classification_report from sklearn.metrics import confusion_matrix from sklearn.decomposition import RandomizedPCA diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py index 2267f02dd0022..299f924e2a468 100644 --- a/examples/calibration/plot_calibration.py +++ b/examples/calibration/plot_calibration.py @@ -36,7 +36,7 @@ from sklearn.naive_bayes import GaussianNB from sklearn.metrics import brier_score_loss from sklearn.calibration import CalibratedClassifierCV -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split n_samples = 50000 diff --git a/examples/calibration/plot_calibration_curve.py b/examples/calibration/plot_calibration_curve.py index 42dc8473e6c30..a37e3158c0c12 100644 --- a/examples/calibration/plot_calibration_curve.py +++ b/examples/calibration/plot_calibration_curve.py @@ -56,7 +56,7 @@ from sklearn.metrics import (brier_score_loss, precision_score, recall_score, f1_score) from sklearn.calibration import CalibratedClassifierCV, calibration_curve -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split # Create dataset of classification task with many redundant and few diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py index 935d388c047e1..38a5bea6e1dc2 100644 --- a/examples/classification/plot_classifier_comparison.py +++ b/examples/classification/plot_classifier_comparison.py @@ -31,7 +31,7 @@ import numpy as np import matplotlib.pyplot as plt from matplotlib.colors import ListedColormap -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.datasets import make_moons, make_circles, make_classification from sklearn.neural_network import MLPClassifier diff --git a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py index 488db4d64855d..a0152c85baf60 100644 --- a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py +++ b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py @@ -30,9 +30,9 @@ from sklearn.cluster import FeatureAgglomeration from sklearn.linear_model import BayesianRidge from sklearn.pipeline import Pipeline -from sklearn.grid_search import GridSearchCV from sklearn.externals.joblib import Memory -from sklearn.cross_validation import KFold +from 
sklearn.model_selection import GridSearchCV +from sklearn.model_selection import KFold ############################################################################### # Generate data @@ -60,7 +60,7 @@ ############################################################################### # Compute the coefs of a Bayesian Ridge with GridSearch -cv = KFold(len(y), 2) # cross-validation generator for model selection +cv = KFold(2) # cross-validation generator for model selection ridge = BayesianRidge() cachedir = tempfile.mkdtemp() mem = Memory(cachedir=cachedir, verbose=1) diff --git a/examples/covariance/plot_covariance_estimation.py b/examples/covariance/plot_covariance_estimation.py index 85e26705b03e9..96f637974ee29 100644 --- a/examples/covariance/plot_covariance_estimation.py +++ b/examples/covariance/plot_covariance_estimation.py @@ -49,7 +49,7 @@ from sklearn.covariance import LedoitWolf, OAS, ShrunkCovariance, \ log_likelihood, empirical_covariance -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV ############################################################################### diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py index 067bcd7b6b479..89bb707a37699 100644 --- a/examples/decomposition/plot_pca_vs_fa_model_selection.py +++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py @@ -35,8 +35,8 @@ from sklearn.decomposition import PCA, FactorAnalysis from sklearn.covariance import ShrunkCovariance, LedoitWolf -from sklearn.cross_validation import cross_val_score -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import cross_val_score +from sklearn.model_selection import GridSearchCV ############################################################################### # Create the data diff --git a/examples/ensemble/plot_feature_transformation.py b/examples/ensemble/plot_feature_transformation.py index 1d5b927e48988..e004c167e67af 100644 --- a/examples/ensemble/plot_feature_transformation.py +++ b/examples/ensemble/plot_feature_transformation.py @@ -35,7 +35,7 @@ from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier, GradientBoostingClassifier) from sklearn.preprocessing import OneHotEncoder -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.metrics import roc_curve from sklearn.pipeline import make_pipeline @@ -118,4 +118,4 @@ plt.ylabel('True positive rate') plt.title('ROC curve (zoomed in at top left)') plt.legend(loc='best') -plt.show() \ No newline at end of file +plt.show() diff --git a/examples/ensemble/plot_gradient_boosting_oob.py b/examples/ensemble/plot_gradient_boosting_oob.py index a39f709d36979..39e623f261cca 100644 --- a/examples/ensemble/plot_gradient_boosting_oob.py +++ b/examples/ensemble/plot_gradient_boosting_oob.py @@ -33,8 +33,8 @@ import matplotlib.pyplot as plt from sklearn import ensemble -from sklearn.cross_validation import KFold -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import KFold +from sklearn.model_selection import train_test_split # Generate data (adapted from G. 
Ridgeway's gbm example) @@ -75,10 +75,10 @@ def heldout_score(clf, X_test, y_test): def cv_estimate(n_folds=3): - cv = KFold(n=X_train.shape[0], n_folds=n_folds) + cv = KFold(n_folds=n_folds) cv_clf = ensemble.GradientBoostingClassifier(**params) val_scores = np.zeros((n_estimators,), dtype=np.float64) - for train, test in cv: + for train, test in cv.split(X_train, y_train): cv_clf.fit(X_train[train], y_train[train]) val_scores += heldout_score(cv_clf, X_train[test], y_train[test]) val_scores /= n_folds diff --git a/examples/ensemble/plot_partial_dependence.py b/examples/ensemble/plot_partial_dependence.py index b480e228d3ca3..d4a26166944c3 100644 --- a/examples/ensemble/plot_partial_dependence.py +++ b/examples/ensemble/plot_partial_dependence.py @@ -51,7 +51,7 @@ from mpl_toolkits.mplot3d import Axes3D -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble.partial_dependence import plot_partial_dependence from sklearn.ensemble.partial_dependence import partial_dependence diff --git a/examples/exercises/plot_cv_diabetes.py b/examples/exercises/plot_cv_diabetes.py index 424c037a5547e..5524d2e3c1334 100644 --- a/examples/exercises/plot_cv_diabetes.py +++ b/examples/exercises/plot_cv_diabetes.py @@ -14,13 +14,17 @@ import numpy as np import matplotlib.pyplot as plt -from sklearn import cross_validation, datasets, linear_model +from sklearn import datasets +from sklearn.linear_model import LassoCV +from sklearn.linear_model import Lasso +from sklearn.model_selection import KFold +from sklearn.model_selection import cross_val_score diabetes = datasets.load_diabetes() X = diabetes.data[:150] y = diabetes.target[:150] -lasso = linear_model.Lasso() +lasso = Lasso() alphas = np.logspace(-4, -.5, 30) scores = list() @@ -28,7 +32,7 @@ for alpha in alphas: lasso.alpha = alpha - this_scores = cross_validation.cross_val_score(lasso, X, y, n_jobs=1) + this_scores = cross_val_score(lasso, X, y, n_jobs=1) scores.append(np.mean(this_scores)) scores_std.append(np.std(this_scores)) @@ -51,15 +55,15 @@ # performs cross-validation on the training data it receives). # We use external cross-validation to see how much the automatically obtained # alphas differ across different cross-validation folds. -lasso_cv = linear_model.LassoCV(alphas=alphas) -k_fold = cross_validation.KFold(len(X), 3) +lasso_cv = LassoCV(alphas=alphas) +k_fold = KFold(3) print("Answer to the bonus question:", "how much can you trust the selection of alpha?") print() print("Alpha parameters maximising the generalization score on different") print("subsets of the data:") -for k, (train, test) in enumerate(k_fold): +for k, (train, test) in enumerate(k_fold.split(X, y)): lasso_cv.fit(X[train], y[train]) print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}". 
format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test]))) diff --git a/examples/exercises/plot_cv_digits.py b/examples/exercises/plot_cv_digits.py index 92f04a935cc5a..a68f92afbdad9 100644 --- a/examples/exercises/plot_cv_digits.py +++ b/examples/exercises/plot_cv_digits.py @@ -12,7 +12,8 @@ import numpy as np -from sklearn import cross_validation, datasets, svm +from sklearn.model_selection import cross_val_score +from sklearn import datasets, svm digits = datasets.load_digits() X = digits.data @@ -25,7 +26,7 @@ scores_std = list() for C in C_s: svc.C = C - this_scores = cross_validation.cross_val_score(svc, X, y, n_jobs=1) + this_scores = cross_val_score(svc, X, y, n_jobs=1) scores.append(np.mean(this_scores)) scores_std.append(np.std(this_scores)) diff --git a/examples/feature_selection/plot_permutation_test_for_classification.py b/examples/feature_selection/plot_permutation_test_for_classification.py index 4df102578c9da..24b999451a067 100644 --- a/examples/feature_selection/plot_permutation_test_for_classification.py +++ b/examples/feature_selection/plot_permutation_test_for_classification.py @@ -20,7 +20,8 @@ import matplotlib.pyplot as plt from sklearn.svm import SVC -from sklearn.cross_validation import StratifiedKFold, permutation_test_score +from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import permutation_test_score from sklearn import datasets @@ -39,7 +40,7 @@ X = np.c_[X, E] svm = SVC(kernel='linear') -cv = StratifiedKFold(y, 2) +cv = StratifiedKFold(2) score, permutation_scores, pvalue = permutation_test_score( svm, X, y, scoring="accuracy", cv=cv, n_permutations=100, n_jobs=1) diff --git a/examples/feature_selection/plot_rfe_with_cross_validation.py b/examples/feature_selection/plot_rfe_with_cross_validation.py index 232aa115c2d77..8b22ab0d54108 100644 --- a/examples/feature_selection/plot_rfe_with_cross_validation.py +++ b/examples/feature_selection/plot_rfe_with_cross_validation.py @@ -10,7 +10,7 @@ import matplotlib.pyplot as plt from sklearn.svm import SVC -from sklearn.cross_validation import StratifiedKFold +from sklearn.model_selection import StratifiedKFold from sklearn.feature_selection import RFECV from sklearn.datasets import make_classification @@ -23,7 +23,7 @@ svc = SVC(kernel="linear") # The "accuracy" scoring is proportional to the number of correct # classifications -rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 2), +rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), scoring='accuracy') rfecv.fit(X, y) diff --git a/examples/feature_stacker.py b/examples/feature_stacker.py index d1f9453e28e58..4ce574aa36bca 100644 --- a/examples/feature_stacker.py +++ b/examples/feature_stacker.py @@ -20,7 +20,7 @@ # License: BSD 3 clause from sklearn.pipeline import Pipeline, FeatureUnion -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from sklearn.svm import SVC from sklearn.datasets import load_iris from sklearn.decomposition import PCA diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 721598d765589..bcfa3c68c69f3 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -57,7 +57,7 @@ import matplotlib.pyplot as plt from sklearn.kernel_ridge import KernelRidge -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from sklearn.gaussian_process import GaussianProcessRegressor from 
sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared diff --git a/examples/linear_model/plot_sgd_comparison.py b/examples/linear_model/plot_sgd_comparison.py index 98bc39cff755e..7506718f93f90 100644 --- a/examples/linear_model/plot_sgd_comparison.py +++ b/examples/linear_model/plot_sgd_comparison.py @@ -14,7 +14,7 @@ import matplotlib.pyplot as plt from sklearn import datasets -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.linear_model import SGDClassifier, Perceptron from sklearn.linear_model import PassiveAggressiveClassifier from sklearn.linear_model import LogisticRegression diff --git a/examples/missing_values.py b/examples/missing_values.py index 59444b36490e3..8a0895f9a589f 100644 --- a/examples/missing_values.py +++ b/examples/missing_values.py @@ -28,7 +28,7 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import Pipeline from sklearn.preprocessing import Imputer -from sklearn.cross_validation import cross_val_score +from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) diff --git a/examples/mixture/plot_gmm_classifier.py b/examples/mixture/plot_gmm_classifier.py index a3179c5aee054..5fd5be80c77cb 100644 --- a/examples/mixture/plot_gmm_classifier.py +++ b/examples/mixture/plot_gmm_classifier.py @@ -33,7 +33,7 @@ import numpy as np from sklearn import datasets -from sklearn.cross_validation import StratifiedKFold +from sklearn.model_selection import StratifiedKFold from sklearn.externals.six.moves import xrange from sklearn.mixture import GMM @@ -56,9 +56,9 @@ def make_ellipses(gmm, ax): # Break up the dataset into non-overlapping training (75%) and testing # (25%) sets. -skf = StratifiedKFold(iris.target, n_folds=4) +skf = StratifiedKFold(n_folds=4) # Only take the first fold. -train_index, test_index = next(iter(skf)) +train_index, test_index = next(iter(skf.split(iris.data, iris.target))) X_train = iris.data[train_index] diff --git a/examples/model_selection/README.txt b/examples/model_selection/README.txt index 553c6e7d6498e..b35a778b28a7f 100644 --- a/examples/model_selection/README.txt +++ b/examples/model_selection/README.txt @@ -3,5 +3,4 @@ Model Selection ----------------------- -Examples concerning model selection, mostly contained in the -:mod:`sklearn.grid_search` and :mod:`sklearn.cross_validation` modules. +Examples related to the :mod:`sklearn.model_selection` module. diff --git a/examples/model_selection/grid_search_digits.py b/examples/model_selection/grid_search_digits.py index c8aec1bab8c0f..6b039629df2a5 100644 --- a/examples/model_selection/grid_search_digits.py +++ b/examples/model_selection/grid_search_digits.py @@ -4,7 +4,7 @@ ============================================================ This examples shows how a classifier is optimized by cross-validation, -which is done using the :class:`sklearn.grid_search.GridSearchCV` object +which is done using the :class:`sklearn.model_selection.GridSearchCV` object on a development set that comprises only half of the available labeled data. 
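The development/evaluation protocol described here can be sketched with the relocated helpers as follows (the half-and-half split and the tiny grid are placeholders)::

    from sklearn import datasets, svm
    from sklearn.metrics import classification_report
    from sklearn.model_selection import GridSearchCV, train_test_split

    digits = datasets.load_digits()
    # The search only ever sees the development half; the other half is
    # reserved for the final evaluation report.
    X_dev, X_eval, y_dev, y_eval = train_test_split(
        digits.data, digits.target, test_size=0.5, random_state=0)
    search = GridSearchCV(svm.SVC(), {'C': [1, 10], 'gamma': [1e-3, 1e-4]})
    search.fit(X_dev, y_dev)
    print(classification_report(y_eval, search.predict(X_eval)))
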
The performance of the selected hyper-parameters and trained model is
@@ -19,8 +19,8 @@ from __future__ import print_function

 from sklearn import datasets
-from sklearn.cross_validation import train_test_split
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import GridSearchCV
 from sklearn.metrics import classification_report
 from sklearn.svm import SVC
diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index 11b690d91e2ef..daf82718d42e1 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -1,3 +1,4 @@
+
 """
 ==========================================================
 Sample pipeline for text feature extraction and evaluation
@@ -56,7 +57,7 @@ from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_extraction.text import TfidfTransformer
 from sklearn.linear_model import SGDClassifier
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline

 print(__doc__)
diff --git a/examples/model_selection/plot_confusion_matrix.py b/examples/model_selection/plot_confusion_matrix.py
index 771d058e2a4a5..250d71c08c442 100644
--- a/examples/model_selection/plot_confusion_matrix.py
+++ b/examples/model_selection/plot_confusion_matrix.py
@@ -30,7 +30,7 @@ import matplotlib.pyplot as plt

 from sklearn import svm, datasets
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import confusion_matrix

 # import some data to play with
diff --git a/examples/model_selection/plot_learning_curve.py b/examples/model_selection/plot_learning_curve.py
index 7a47fd574635d..3adcd78aa8b6a 100644
--- a/examples/model_selection/plot_learning_curve.py
+++ b/examples/model_selection/plot_learning_curve.py
@@ -17,11 +17,11 @@ import numpy as np
 import matplotlib.pyplot as plt

-from sklearn import cross_validation
 from sklearn.naive_bayes import GaussianNB
 from sklearn.svm import SVC
 from sklearn.datasets import load_digits
-from sklearn.learning_curve import learning_curve
+from sklearn.model_selection import learning_curve
+from sklearn.model_selection import ShuffleSplit


 def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
@@ -48,10 +48,20 @@ def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
     ylim : tuple, shape (ymin, ymax), optional
         Defines minimum and maximum yvalues plotted.

-    cv : integer, cross-validation generator, optional
-        If an integer is passed, it is the number of folds (defaults to 3).
-        Specific cross-validation objects can be passed, see
-        sklearn.cross_validation module for the list of possible objects
+    cv : int, cross-validation generator or an iterable, optional
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+          - None, to use the default 3-fold cross-validation,
+          - integer, to specify the number of folds.
+          - An object to be used as a cross-validation generator.
+          - An iterable yielding train/test splits.
+
+        For integer/None inputs, if ``y`` is binary or multiclass,
+        :class:`StratifiedKFold` is used. If the estimator is not a classifier
+        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
+
+        Refer to the :ref:`User Guide <cross_validation>` for the various
+        cross-validators that can be used here.
n_jobs : integer, optional Number of jobs to run in parallel (default 1). @@ -91,16 +101,14 @@ def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, title = "Learning Curves (Naive Bayes)" # Cross validation with 100 iterations to get smoother mean test and train # score curves, each time with 20% data randomly selected as a validation set. -cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100, - test_size=0.2, random_state=0) +cv = ShuffleSplit(n_iter=100, test_size=0.2, random_state=0) estimator = GaussianNB() plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4) title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)" # SVC is more expensive so we do a lower number of CV iterations: -cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10, - test_size=0.2, random_state=0) +cv = ShuffleSplit(n_iter=10, test_size=0.2, random_state=0) estimator = SVC(gamma=0.001) plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4) diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py index 4d83910008f4a..f9244410d5792 100644 --- a/examples/model_selection/plot_precision_recall.py +++ b/examples/model_selection/plot_precision_recall.py @@ -80,7 +80,7 @@ from sklearn import svm, datasets from sklearn.metrics import precision_recall_curve from sklearn.metrics import average_precision_score -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 8448998e2ae7f..a3884eff33c62 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -43,7 +43,7 @@ from sklearn import svm, datasets from sklearn.metrics import roc_curve, auc -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier from scipy import interp diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index 6671e60166dd4..1508839386ffe 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -25,7 +25,7 @@ .. 
diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index 6671e60166dd4..1508839386ffe 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -25,7 +25,7 @@
 .. note:: See also :func:`sklearn.metrics.auc_score`,
-          :func:`sklearn.cross_validation.cross_val_score`,
+          :func:`sklearn.model_selection.cross_val_score`,
           :ref:`example_model_selection_plot_roc.py`,
 
 """
@@ -38,7 +38,7 @@
 from sklearn import svm, datasets
 from sklearn.metrics import roc_curve, auc
-from sklearn.cross_validation import StratifiedKFold
+from sklearn.model_selection import StratifiedKFold
 
 ###############################################################################
 # Data IO and generation
@@ -58,7 +58,7 @@
 # Classification and ROC analysis
 
 # Run classifier with cross-validation and plot ROC curves
-cv = StratifiedKFold(y, n_folds=6)
+cv = StratifiedKFold(n_folds=6)
 classifier = svm.SVC(kernel='linear', probability=True,
                      random_state=random_state)
@@ -70,7 +70,7 @@
 lw = 2
 i = 0
-for (train, test), color in zip(cv, colors):
+for (train, test), color in zip(cv.split(X, y), colors):
     probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
     # Compute ROC curve and area the curve
     fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
@@ -84,7 +84,7 @@
 plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
          label='Luck')
-mean_tpr /= len(cv)
+mean_tpr /= cv.get_n_splits(X, y)
 mean_tpr[-1] = 1.0
 mean_auc = auc(mean_fpr, mean_tpr)
 plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
diff --git a/examples/model_selection/plot_underfitting_overfitting.py b/examples/model_selection/plot_underfitting_overfitting.py
index f8958cbffe21b..ff454664c7b18 100644
--- a/examples/model_selection/plot_underfitting_overfitting.py
+++ b/examples/model_selection/plot_underfitting_overfitting.py
@@ -27,7 +27,7 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.linear_model import LinearRegression
-from sklearn import cross_validation
+from sklearn.model_selection import cross_val_score
 
 np.random.seed(0)
@@ -51,8 +51,8 @@
     pipeline.fit(X[:, np.newaxis], y)
 
     # Evaluate the models using crossvalidation
-    scores = cross_validation.cross_val_score(pipeline,
-        X[:, np.newaxis], y, scoring="mean_squared_error", cv=10)
+    scores = cross_val_score(pipeline, X[:, np.newaxis], y,
+                             scoring="mean_squared_error", cv=10)
 
     X_test = np.linspace(0, 1, 100)
     plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py
index e3232984aa33a..ed74a41ff100b 100644
--- a/examples/model_selection/plot_validation_curve.py
+++ b/examples/model_selection/plot_validation_curve.py
@@ -18,7 +18,7 @@
 from sklearn.datasets import load_digits
 from sklearn.svm import SVC
-from sklearn.learning_curve import validation_curve
+from sklearn.model_selection import validation_curve
 
 digits = load_digits()
 X, y = digits.data, digits.target
diff --git a/examples/model_selection/randomized_search.py b/examples/model_selection/randomized_search.py
index 0682b24689813..85a16c6f52d55 100644
--- a/examples/model_selection/randomized_search.py
+++ b/examples/model_selection/randomized_search.py
@@ -26,7 +26,8 @@
 from operator import itemgetter
 from scipy.stats import randint as sp_randint
 
-from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
+from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import RandomizedSearchCV
 from sklearn.datasets import load_digits
 from sklearn.ensemble import RandomForestClassifier
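The plot_roc_crossval.py hunks above illustrate the new splitter protocol:
the data is handed to ``split`` rather than to the constructor, and
``get_n_splits`` replaces ``len(cv)``. A minimal sketch, assuming the
``n_folds`` parameter name used in this changeset::

    from sklearn import datasets
    from sklearn.model_selection import StratifiedKFold

    iris = datasets.load_iris()
    X, y = iris.data, iris.target

    cv = StratifiedKFold(n_folds=3)

    # The splitter yields train/test index arrays once given the data.
    for train, test in cv.split(X, y):
        print(train.shape, test.shape)

    # Explicit query instead of len(cv):
    n_splits = cv.get_n_splits(X, y)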
diff --git a/examples/neighbors/plot_digits_kde_sampling.py b/examples/neighbors/plot_digits_kde_sampling.py
index 4680a41780aed..ba59fb5ece537 100644
--- a/examples/neighbors/plot_digits_kde_sampling.py
+++ b/examples/neighbors/plot_digits_kde_sampling.py
@@ -16,7 +16,7 @@
 from sklearn.datasets import load_digits
 from sklearn.neighbors import KernelDensity
 from sklearn.decomposition import PCA
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import GridSearchCV
 
 # load the data
 digits = load_digits()
diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py
index 9f085036d2013..2b9b15fe3d966 100644
--- a/examples/neural_networks/plot_rbm_logistic_classification.py
+++ b/examples/neural_networks/plot_rbm_logistic_classification.py
@@ -37,7 +37,7 @@
 from scipy.ndimage import convolve
 from sklearn import linear_model, datasets, metrics
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.neural_network import BernoulliRBM
 from sklearn.pipeline import Pipeline
diff --git a/examples/plot_cv_predict.py b/examples/plot_cv_predict.py
index 5a9e541949dc9..4657ff816369a 100644
--- a/examples/plot_cv_predict.py
+++ b/examples/plot_cv_predict.py
@@ -8,7 +8,7 @@
 """
 from sklearn import datasets
-from sklearn.cross_validation import cross_val_predict
+from sklearn.model_selection import cross_val_predict
 from sklearn import linear_model
 import matplotlib.pyplot as plt
diff --git a/examples/plot_digits_pipe.py b/examples/plot_digits_pipe.py
index 139ade15ba7c2..cd134fe20ddec 100644
--- a/examples/plot_digits_pipe.py
+++ b/examples/plot_digits_pipe.py
@@ -25,7 +25,7 @@
 from sklearn import linear_model, decomposition, datasets
 from sklearn.pipeline import Pipeline
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import GridSearchCV
 
 logistic = linear_model.LogisticRegression()
diff --git a/examples/plot_kernel_ridge_regression.py b/examples/plot_kernel_ridge_regression.py
index 19aeece6658cc..5ca4093a05b06 100644
--- a/examples/plot_kernel_ridge_regression.py
+++ b/examples/plot_kernel_ridge_regression.py
@@ -41,8 +41,8 @@
 import numpy as np
 
 from sklearn.svm import SVR
-from sklearn.grid_search import GridSearchCV
-from sklearn.learning_curve import learning_curve
+from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import learning_curve
 from sklearn.kernel_ridge import KernelRidge
 import matplotlib.pyplot as plt
diff --git a/examples/preprocessing/plot_function_transformer.py b/examples/preprocessing/plot_function_transformer.py
index 031a126a92e6d..bfe5e41932201 100644
--- a/examples/preprocessing/plot_function_transformer.py
+++ b/examples/preprocessing/plot_function_transformer.py
@@ -11,7 +11,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.decomposition import PCA
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import FunctionTransformer
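For ``GridSearchCV`` the hunks above are likewise a pure import move;
construction, fitting and the fitted attributes are untouched. A sketch of
the unchanged call pattern (not taken from the examples themselves)::

    import numpy as np
    from sklearn import datasets, svm
    from sklearn.model_selection import GridSearchCV

    iris = datasets.load_iris()
    param_grid = {'C': [0.1, 1, 10], 'gamma': np.logspace(-3, 0, 4)}

    # Identical to the grid_search.GridSearchCV call it replaces.
    grid = GridSearchCV(svm.SVC(), param_grid=param_grid)
    grid.fit(iris.data, iris.target)
    print(grid.best_params_)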
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index d3ea13e2e88e7..27bbd94ee00c9 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -74,8 +74,8 @@
 from sklearn.svm import SVC
 from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import load_iris
-from sklearn.cross_validation import StratifiedShuffleSplit
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import StratifiedShuffleSplit
+from sklearn.model_selection import GridSearchCV
 
 
 # Utility function to move the midpoint of a colormap to be around
@@ -128,7 +128,7 @@ def __call__(self, value, clip=None):
 C_range = np.logspace(-2, 10, 13)
 gamma_range = np.logspace(-9, 3, 13)
 param_grid = dict(gamma=gamma_range, C=C_range)
-cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
+cv = StratifiedShuffleSplit(n_iter=5, test_size=0.2, random_state=42)
 grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
 grid.fit(X, y)
diff --git a/examples/svm/plot_svm_anova.py b/examples/svm/plot_svm_anova.py
index 9ce225bf980e5..45da4c35e0a64 100644
--- a/examples/svm/plot_svm_anova.py
+++ b/examples/svm/plot_svm_anova.py
@@ -10,7 +10,8 @@
 import numpy as np
 import matplotlib.pyplot as plt
-from sklearn import svm, datasets, feature_selection, cross_validation
+from sklearn import svm, datasets, feature_selection
+from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
 
 ###############################################################################
@@ -42,7 +43,7 @@
 for percentile in percentiles:
     clf.set_params(anova__percentile=percentile)
     # Compute cross-validation score using all CPUs
-    this_scores = cross_validation.cross_val_score(clf, X, y, n_jobs=1)
+    this_scores = cross_val_score(clf, X, y, n_jobs=1)
     score_means.append(this_scores.mean())
     score_stds.append(this_scores.std())
diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index 60173338bd0cd..ed92bc19dcada 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -88,8 +88,8 @@
 import matplotlib.pyplot as plt
 
 from sklearn.svm import LinearSVC
-from sklearn.cross_validation import ShuffleSplit
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import ShuffleSplit
+from sklearn.model_selection import GridSearchCV
 from sklearn.utils import check_random_state
 from sklearn import datasets
@@ -128,8 +128,8 @@
 # To get nice curve, we need a large number of iterations to
 # reduce the variance
 grid = GridSearchCV(clf, refit=False, param_grid=param_grid,
-                    cv=ShuffleSplit(n=n_samples, train_size=train_size,
-                                    n_iter=250, random_state=1))
+                    cv=ShuffleSplit(train_size=train_size, n_iter=250,
+                                    random_state=1))
 grid.fit(X, y)
 
 scores = [x[1] for x in grid.grid_scores_]
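As in plot_rbf_parameters.py and plot_svm_scale_c.py above, a data-independent
splitter can be handed straight to ``GridSearchCV``, which calls
``cv.split(X, y)`` internally. A minimal sketch, again assuming the ``n_iter``
parameter name used in this changeset::

    from sklearn import datasets, svm
    from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

    iris = datasets.load_iris()

    # No y at construction time; GridSearchCV supplies the data later.
    cv = StratifiedShuffleSplit(n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(svm.SVC(), param_grid={'C': [0.1, 1, 10]}, cv=cv)
    grid.fit(iris.data, iris.target)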