diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 38110090dffdc..48fbb10f7f924 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -794,7 +794,7 @@ adheres to the scikit-learn interface and standards by running The main motivation to make a class compatible to the scikit-learn estimator interface might be that you want to use it together with model assessment and -selection tools such as :class:`grid_search.GridSearchCV`. +selection tools such as :class:`model_selection.GridSearchCV`. For this to work, you need to implement the following interface. If a dependency on scikit-learn is okay for your code, @@ -856,7 +856,7 @@ implement the interface is:: Parameters and init ------------------- -As :class:`grid_search.GridSearchCV` uses ``set_params`` +As :class:`model_selection.GridSearchCV` uses ``set_params`` to apply parameter setting to estimators, it is essential that calling ``set_params`` has the same effect as setting parameters using the ``__init__`` method. @@ -874,9 +874,8 @@ trailing ``_`` is used to check if the estimator has been fitted. Cloning ------- -For using :class:`grid_search.GridSearch` or any functionality of the -:mod:`cross_validation` module, an estimator must support the ``base.clone`` -function to replicate an estimator. +For use with the :mod:`model_selection` module, +an estimator must support the ``base.clone`` function to replicate an estimator. This can be done by providing a ``get_params`` method. If ``get_params`` is present, then ``clone(estimator)`` will be an instance of ``type(estimator)`` on which ``set_params`` has been called with clones of @@ -901,8 +900,8 @@ accepts an optional ``y``. Estimator types --------------- Some common functionality depends on the kind of estimator passed. -For example, cross-validation in :class:`grid_search.GridSearchCV` and -:func:`cross_validation.cross_val_score` defaults to being stratified when used +For example, cross-validation in :class:`model_selection.GridSearchCV` and +:func:`model_selection.cross_val_score` defaults to being stratified when used on a classifier, but not otherwise. Similarly, scorers for average precision that take a continuous prediction need to call ``decision_function`` for classifiers, but ``predict`` for regressors. This distinction between classifiers and regressors diff --git a/doc/model_selection.rst b/doc/model_selection.rst index daec6a6ed83e4..43036e9bdf6c7 100644 --- a/doc/model_selection.rst +++ b/doc/model_selection.rst @@ -11,4 +11,3 @@ Model selection and evaluation modules/grid_search modules/model_evaluation modules/model_persistence - modules/learning_curve diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3ba4c8952568e..64f99c06ebd8f 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -144,16 +144,50 @@ Classes covariance.graph_lasso -.. _cross_validation_ref: - -:mod:`sklearn.cross_validation`: Cross Validation -================================================= +:mod:`sklearn.model_selection`: Model Selection +=============================================== -.. automodule:: sklearn.cross_validation +.. automodule:: sklearn.model_selection :no-members: :no-inherited-members: -**User guide:** See the :ref:`cross_validation` section for further details. +**User guide:** See the :ref:`cross_validation`, :ref:`grid_search` and +:ref:`learning_curve` sections for further details. + +Splitter Classes +---------------- + +.. currentmodule:: sklearn + +.. 
autosummary:: + :toctree: generated/ + :template: class.rst + + model_selection.KFold + model_selection.LabelKFold + model_selection.StratifiedKFold + model_selection.LeaveOneLabelOut + model_selection.LeavePLabelOut + model_selection.LeaveOneOut + model_selection.LeavePOut + model_selection.ShuffleSplit + model_selection.LabelShuffleSplit + model_selection.StratifiedShuffleSplit + model_selection.PredefinedSplit + +Splitter Functions +------------------ + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + model_selection.train_test_split + +Hyper-parameter optimizers +-------------------------- .. currentmodule:: sklearn @@ -161,28 +195,25 @@ Classes :toctree: generated/ :template: class.rst - cross_validation.KFold - cross_validation.LabelKFold - cross_validation.LabelShuffleSplit - cross_validation.LeaveOneLabelOut - cross_validation.LeaveOneOut - cross_validation.LeavePLabelOut - cross_validation.LeavePOut - cross_validation.PredefinedSplit - cross_validation.ShuffleSplit - cross_validation.StratifiedKFold - cross_validation.StratifiedShuffleSplit + model_selection.GridSearchCV + model_selection.RandomizedSearchCV + model_selection.ParameterGrid + model_selection.ParameterSampler + +Model validation +---------------- +.. currentmodule:: sklearn .. autosummary:: :toctree: generated/ :template: function.rst - cross_validation.train_test_split - cross_validation.cross_val_score - cross_validation.cross_val_predict - cross_validation.permutation_test_score - cross_validation.check_cv + model_selection.cross_val_score + model_selection.cross_val_predict + model_selection.permutation_test_score + model_selection.learning_curve + model_selection.validation_curve .. _datasets_ref: @@ -547,29 +578,6 @@ Kernels: gaussian_process.kernels.CompoundKernel gaussian_process.kernels.Hyperparameter -.. _grid_search_ref: - -:mod:`sklearn.grid_search`: Grid Search -======================================= - -.. automodule:: sklearn.grid_search - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`grid_search` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - grid_search.GridSearchCV - grid_search.ParameterGrid - grid_search.ParameterSampler - grid_search.RandomizedSearchCV - - .. _isotonic_ref: :mod:`sklearn.isotonic`: Isotonic regression @@ -658,24 +666,6 @@ Kernels: discriminant_analysis.QuadraticDiscriminantAnalysis -.. _learning_curve_ref: - -:mod:`sklearn.learning_curve` Learning curve evaluation -======================================================= - -.. automodule:: sklearn.learning_curve - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - learning_curve.learning_curve - learning_curve.validation_curve - .. _linear_model_ref: :mod:`sklearn.linear_model`: Generalized Linear Models diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index e76b95c6e48be..52f98e341971b 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -4,7 +4,7 @@ Cross-validation: evaluating estimator performance =================================================== -.. currentmodule:: sklearn.cross_validation +.. 
currentmodule:: sklearn.model_selection Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat @@ -24,7 +24,7 @@ can be quickly computed with the :func:`train_test_split` helper function. Let's load the iris data set to fit a linear support vector machine on it:: >>> import numpy as np - >>> from sklearn import cross_validation + >>> from sklearn.model_selection import train_test_split >>> from sklearn import datasets >>> from sklearn import svm @@ -35,7 +35,7 @@ Let's load the iris data set to fit a linear support vector machine on it:: We can now quickly sample a training set while holding out 40% of the data for testing (evaluating) our classifier:: - >>> X_train, X_test, y_train, y_test = cross_validation.train_test_split( + >>> X_train, X_test, y_train, y_test = train_test_split( ... iris.data, iris.target, test_size=0.4, random_state=0) >>> X_train.shape, y_train.shape @@ -101,10 +101,9 @@ kernel support vector machine on the iris dataset by splitting the data, fitting a model and computing the score 5 consecutive times (with different splits each time):: + >>> from sklearn.model_selection import cross_val_score >>> clf = svm.SVC(kernel='linear', C=1) - >>> scores = cross_validation.cross_val_score( - ... clf, iris.data, iris.target, cv=5) - ... + >>> scores = cross_val_score(clf, iris.data, iris.target, cv=5) >>> scores # doctest: +ELLIPSIS array([ 0.96..., 1. ..., 0.96..., 0.96..., 1. ]) @@ -119,8 +118,8 @@ method of the estimator. It is possible to change this by using the scoring parameter:: >>> from sklearn import metrics - >>> scores = cross_validation.cross_val_score(clf, iris.data, iris.target, - ... cv=5, scoring='f1_weighted') + >>> scores = cross_val_score( + ... clf, iris.data, iris.target, cv=5, scoring='f1_macro') >>> scores # doctest: +ELLIPSIS array([ 0.96..., 1. ..., 0.96..., 0.96..., 1. ]) @@ -136,11 +135,10 @@ being used if the estimator derives from :class:`ClassifierMixin It is also possible to use other cross validation strategies by passing a cross validation iterator instead, for instance:: + >>> from sklearn.model_selection import ShuffleSplit >>> n_samples = iris.data.shape[0] - >>> cv = cross_validation.ShuffleSplit(n_samples, n_iter=3, - ... test_size=0.3, random_state=0) - - >>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=cv) + >>> cv = ShuffleSplit(n_iter=3, test_size=0.3, random_state=0) + >>> cross_val_score(clf, iris.data, iris.target, cv=cv) ... # doctest: +ELLIPSIS array([ 0.97..., 0.97..., 1. ]) @@ -153,7 +151,7 @@ validation iterator instead, for instance:: be learnt from a training set and applied to held-out data for prediction:: >>> from sklearn import preprocessing - >>> X_train, X_test, y_train, y_test = cross_validation.train_test_split( + >>> X_train, X_test, y_train, y_test = train_test_split( ... iris.data, iris.target, test_size=0.4, random_state=0) >>> scaler = preprocessing.StandardScaler().fit(X_train) >>> X_train_transformed = scaler.transform(X_train) @@ -167,7 +165,7 @@ validation iterator instead, for instance:: >>> from sklearn.pipeline import make_pipeline >>> clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1)) - >>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=cv) + >>> cross_val_score(clf, iris.data, iris.target, cv=cv) ... # doctest: +ELLIPSIS array([ 0.97..., 0.93..., 0.95...]) @@ -184,8 +182,8 @@ can be used (otherwise, an exception is raised). 
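A side effect of the rewritten examples above is that a splitter is no longer bound to a dataset at construction time; the data is only consulted when ``split`` is called. A minimal sketch of that reuse, assuming the ``n_iter``-style constructor names used throughout this patch::

    import numpy as np
    from sklearn.model_selection import ShuffleSplit

    # One splitter instance can be reused on datasets of different sizes,
    # because the train/test indices are generated lazily by split().
    cv = ShuffleSplit(n_iter=2, test_size=0.5, random_state=0)
    for train, test in cv.split(np.arange(8).reshape(4, 2)):
        print("%s %s" % (train, test))
    for train, test in cv.split(np.arange(12).reshape(6, 2)):
        print("%s %s" % (train, test))
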
These prediction can then be used to evaluate the classifier:: - >>> predicted = cross_validation.cross_val_predict(clf, iris.data, - ... iris.target, cv=10) + >>> from sklearn.model_selection import cross_val_predict + >>> predicted = cross_val_predict(clf, iris.data, iris.target, cv=10) >>> metrics.accuracy_score(iris.target, predicted) # doctest: +ELLIPSIS 0.966... @@ -223,10 +221,11 @@ learned using :math:`k - 1` folds, and the fold left out is used for test. Example of 2-fold cross-validation on a dataset with 4 samples:: >>> import numpy as np - >>> from sklearn.cross_validation import KFold + >>> from sklearn.model_selection import KFold - >>> kf = KFold(4, n_folds=2) - >>> for train, test in kf: + >>> X = ["a", "b", "c", "d"] + >>> kf = KFold(n_folds=2) + >>> for train, test in kf.split(X): ... print("%s %s" % (train, test)) [2 3] [0 1] [0 1] [2 3] @@ -250,11 +249,12 @@ target class as the complete set. Example of stratified 3-fold cross-validation on a dataset with 10 samples from two slightly unbalanced classes:: - >>> from sklearn.cross_validation import StratifiedKFold + >>> from sklearn.model_selection import StratifiedKFold - >>> labels = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1] - >>> skf = StratifiedKFold(labels, 3) - >>> for train, test in skf: + >>> X = np.ones(10) + >>> y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + >>> skf = StratifiedKFold(n_folds=3) + >>> for train, test in skf.split(X, y): ... print("%s %s" % (train, test)) [2 3 6 7 8 9] [0 1 4 5] [0 1 3 4 5 8 9] [2 6 7] @@ -272,12 +272,14 @@ subjects. Imagine you have three subjects, each with an associated number from 1 to 3:: - >>> from sklearn.cross_validation import LabelKFold + >>> from sklearn.model_selection import LabelKFold + >>> X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10] + >>> y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"] >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] - >>> lkf = LabelKFold(labels, n_folds=3) - >>> for train, test in lkf: + >>> lkf = LabelKFold(n_folds=3) + >>> for train, test in lkf.split(X, y, labels): ... print("%s %s" % (train, test)) [0 1 2 3 4 5] [6 7 8 9] [0 1 2 6 7 8 9] [3 4 5] @@ -298,10 +300,11 @@ training sets and :math:`n` different tests set. This cross-validation procedure does not waste much data as only one sample is removed from the training set:: - >>> from sklearn.cross_validation import LeaveOneOut + >>> from sklearn.model_selection import LeaveOneOut - >>> loo = LeaveOneOut(4) - >>> for train, test in loo: + >>> X = [1, 2, 3, 4] + >>> loo = LeaveOneOut() + >>> for train, test in loo.split(X): ... print("%s %s" % (train, test)) [1 2 3] [0] [0 2 3] [1] @@ -356,10 +359,11 @@ overlap for :math:`p > 1`. Example of Leave-2-Out on a dataset with 4 samples:: - >>> from sklearn.cross_validation import LeavePOut + >>> from sklearn.model_selection import LeavePOut - >>> lpo = LeavePOut(4, p=2) - >>> for train, test in lpo: + >>> X = np.ones(4) + >>> lpo = LeavePOut(p=2) + >>> for train, test in lpo.split(X): ... 
print("%s %s" % (train, test)) [2 3] [0 1] [1 3] [0 2] @@ -384,11 +388,13 @@ For example, in the cases of multiple experiments, *LOLO* can be used to create a cross-validation based on the different experiments: we create a training set using the samples of all the experiments except one:: - >>> from sklearn.cross_validation import LeaveOneLabelOut + >>> from sklearn.model_selection import LeaveOneLabelOut + >>> X = [1, 5, 10, 50] + >>> y = [0, 1, 1, 2] >>> labels = [1, 1, 2, 2] - >>> lolo = LeaveOneLabelOut(labels) - >>> for train, test in lolo: + >>> lolo = LeaveOneLabelOut() + >>> for train, test in lolo.split(X, y, labels): ... print("%s %s" % (train, test)) [2 3] [0 1] [0 1] [2 3] @@ -416,11 +422,13 @@ samples related to :math:`P` labels for each training/test set. Example of Leave-2-Label Out:: - >>> from sklearn.cross_validation import LeavePLabelOut + >>> from sklearn.model_selection import LeavePLabelOut + >>> X = np.arange(6) + >>> y = [1, 1, 1, 2, 2, 2] >>> labels = [1, 1, 2, 2, 3, 3] - >>> lplo = LeavePLabelOut(labels, p=2) - >>> for train, test in lplo: + >>> lplo = LeavePLabelOut(n_labels=2) + >>> for train, test in lplo.split(X, y, labels): ... print("%s %s" % (train, test)) [4 5] [0 1 2 3] [2 3] [0 1 4 5] @@ -443,9 +451,11 @@ generator. Here is a usage example:: - >>> ss = cross_validation.ShuffleSplit(5, n_iter=3, test_size=0.25, + >>> from sklearn.model_selection import ShuffleSplit + >>> X = np.arange(5) + >>> ss = ShuffleSplit(n_iter=3, test_size=0.25, ... random_state=0) - >>> for train_index, test_index in ss: + >>> for train_index, test_index in ss.split(X): ... print("%s %s" % (train_index, test_index)) ... [1 3 4] [2 0] @@ -469,12 +479,13 @@ out for each split. Here is a usage example:: - >>> from sklearn.cross_validation import LabelShuffleSplit + >>> from sklearn.model_selection import LabelShuffleSplit + >>> X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001] + >>> y = ["a", "b", "b", "b", "c", "c", "c", "a"] >>> labels = [1, 1, 2, 2, 3, 3, 4, 4] - >>> slo = LabelShuffleSplit(labels, n_iter=4, test_size=0.5, - ... random_state=0) - >>> for train, test in slo: + >>> lss = LabelShuffleSplit(n_iter=4, test_size=0.5, random_state=0) + >>> for train, test in lss.split(X, y, labels): ... print("%s %s" % (train, test)) ... [0 1 2 3] [4 5 6 7] @@ -541,4 +552,4 @@ Cross validation and model selection Cross validation iterators can also be used to directly perform model selection using Grid Search for the optimal hyperparameters of the -model. This is the topic if the next section: :ref:`grid_search`. +model. This is the topic of the next section: :ref:`grid_search`. diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index a033581012100..1e34706c4fec8 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -156,7 +156,7 @@ picked as the splitting rule. 
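Stepping back to the label-based iterators rewritten above: they all share the three-argument ``split(X, y, labels)`` protocol. A compact sketch of the grouping constraint, assuming the ``Label*`` class names used in this changeset::

    import numpy as np
    from sklearn.model_selection import LabelKFold

    X = np.arange(12).reshape(6, 2)
    y = [0, 0, 1, 1, 0, 1]
    labels = [1, 1, 2, 2, 3, 3]  # e.g. one label per subject
    # Samples sharing a label never appear on both sides of a split.
    for train, test in LabelKFold(n_folds=3).split(X, y, labels):
        print("train: %s test: %s" % (train, test))
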
This usually allows to reduce the variance of the model a bit more, at the expense of a slightly greater increase in bias:: - >>> from sklearn.cross_validation import cross_val_score + >>> from sklearn.model_selection import cross_val_score >>> from sklearn.datasets import make_blobs >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.ensemble import ExtraTreesClassifier @@ -360,7 +360,7 @@ Usage The following example shows how to fit an AdaBoost classifier with 100 weak learners:: - >>> from sklearn.cross_validation import cross_val_score + >>> from sklearn.model_selection import cross_val_score >>> from sklearn.datasets import load_iris >>> from sklearn.ensemble import AdaBoostClassifier @@ -948,7 +948,7 @@ Usage The following example shows how to fit the majority rule classifier:: >>> from sklearn import datasets - >>> from sklearn import cross_validation + >>> from sklearn.model_selection import cross_val_score >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.naive_bayes import GaussianNB >>> from sklearn.ensemble import RandomForestClassifier @@ -964,7 +964,7 @@ The following example shows how to fit the majority rule classifier:: >>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') >>> for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']): - ... scores = cross_validation.cross_val_score(clf, X, y, cv=5, scoring='accuracy') + ... scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy') ... print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) Accuracy: 0.90 (+/- 0.05) [Logistic Regression] Accuracy: 0.93 (+/- 0.05) [Random Forest] @@ -1041,7 +1041,7 @@ Using the `VotingClassifier` with `GridSearch` The `VotingClassifier` can also be used together with `GridSearch` in order to tune the hyperparameters of the individual estimators:: - >>> from sklearn.grid_search import GridSearchCV + >>> from sklearn.model_selection import GridSearchCV >>> clf1 = LogisticRegression(random_state=1) >>> clf2 = RandomForestClassifier(random_state=1) >>> clf3 = GaussianNB() diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index eb4a0b10828f3..de66f8bc02906 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -1,26 +1,27 @@ -.. currentmodule:: sklearn.grid_search + + +.. currentmodule:: sklearn.model_selection .. _grid_search: -=============================================== -Grid Search: Searching for estimator parameters -=============================================== +=========================================== +Tuning the hyper-parameters of an estimator +=========================================== + +Hyper-parameters are parameters that are not directly learnt within estimators. +In scikit-learn they are passed as arguments to the constructor of the +estimator classes. Typical examples include ``C``, ``kernel`` and ``gamma`` +for Support Vector Classifier, ``alpha`` for Lasso, etc. -Parameters that are not directly learnt within estimators can be set by -searching a parameter space for the best :ref:`cross_validation` score. -Typical examples include ``C``, ``kernel`` and ``gamma`` for Support Vector -Classifier, ``alpha`` for Lasso, etc. +It is possible and recommended to search the hyper-parameter space for the +best :ref:`cross_validation` score. Any parameter provided when constructing an estimator may be optimized in this -manner. 
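As a rough sketch of such a search with the relocated class (the grid values below are illustrative, not recommendations)::

    import numpy as np
    from sklearn import datasets, svm
    from sklearn.model_selection import GridSearchCV

    iris = datasets.load_iris()
    # Keys in the grid must match the estimator's constructor arguments,
    # which get_params() reports (see below).
    param_grid = {'C': [0.1, 1, 10], 'gamma': np.logspace(-3, 0, 4)}
    search = GridSearchCV(svm.SVC(), param_grid=param_grid)
    search.fit(iris.data, iris.target)
    print(search.best_params_)
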
Specifically, to find the names and current values for all parameters
+manner. Specifically, to find the names and current values for all parameters
 for a given estimator, use::

     estimator.get_params()

-Such parameters are often referred to as *hyperparameters* (particularly in
-Bayesian learning), distinguishing them from the parameters optimised in a
-machine learning procedure.
-
 A search consists of:

 - an estimator (regressor or classifier such as ``sklearn.svm.SVC()``);
@@ -38,6 +39,12 @@ given number of candidates from a parameter space with a specified
 distribution. After describing these tools we detail
 :ref:`best practice <grid_search_tips>` applicable to both approaches.

+Note that it is common that a small subset of those parameters can have a large
+impact on the predictive or computational performance of the model while others
+can be left to their default values. It is recommended to read the docstring of
+the estimator class to get a finer understanding of their expected behavior,
+possibly by reading the enclosed reference to the literature.
+
 Exhaustive Grid Search
 ======================

@@ -59,7 +66,7 @@ The :class:`GridSearchCV` instance implements the usual estimator API: when
 "fitting" it on a dataset all the possible combinations of parameter values are
 evaluated and the best combination is retained.

-.. currentmodule:: sklearn.grid_search
+.. currentmodule:: sklearn.model_selection

 .. topic:: Examples:

@@ -164,7 +171,7 @@ it is recommended to split the data into a **development set** (to be fed to
 the ``GridSearchCV`` instance) and an **evaluation set** to compute performance
 metrics.

-This can be done by using the :func:`cross_validation.train_test_split`
+This can be done by using the :func:`train_test_split`
 utility function.

 Parallelism
diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst
index 4cd655fdbb6c0..8708ef8c7acdf 100644
--- a/doc/modules/learning_curve.rst
+++ b/doc/modules/learning_curve.rst
@@ -4,7 +4,7 @@ Validation curves: plotting scores to evaluate models
 =====================================================

-.. currentmodule:: sklearn.learning_curve
+.. currentmodule:: sklearn.model_selection

 Every estimator has its advantages and drawbacks. Its generalization error
 can be decomposed in terms of bias, variance and noise. The **bias** of an
@@ -69,7 +69,7 @@ values. The function :func:`validation_curve` can help in this case::

     >>> import numpy as np
-    >>> from sklearn.learning_curve import validation_curve
+    >>> from sklearn.model_selection import validation_curve
     >>> from sklearn.datasets import load_iris
     >>> from sklearn.linear_model import Ridge
@@ -140,7 +140,7 @@ that are required to plot such a learning curve (number of samples that have
 been used, the average scores on the training sets and
 the average scores on the validation sets)::

-    >>> from sklearn.learning_curve import learning_curve
+    >>> from sklearn.model_selection import learning_curve
     >>> from sklearn.svm import SVC

     >>> train_sizes, train_scores, valid_scores = learning_curve(
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 1f2d282499646..d7361127d944c 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -15,8 +15,8 @@ model:
 * **Scoring parameter**: Model-evaluation tools using
   :ref:`cross-validation <cross_validation>` (such as
-  :func:`cross_validation.cross_val_score` and
-  :class:`grid_search.GridSearchCV`) rely on an internal *scoring* strategy.
+ :func:`model_selection.cross_val_score` and + :class:`model_selection.GridSearchCV`) rely on an internal *scoring* strategy. This is discussed in the section :ref:`scoring_parameter`. * **Metric functions**: The :mod:`metrics` module implements functions @@ -39,8 +39,8 @@ The ``scoring`` parameter: defining model evaluation rules ========================================================== Model selection and evaluation using tools, such as -:class:`grid_search.GridSearchCV` and -:func:`cross_validation.cross_val_score`, take a ``scoring`` parameter that +:class:`model_selection.GridSearchCV` and +:func:`model_selection.cross_val_score`, take a ``scoring`` parameter that controls what metric they apply to the estimators evaluated. Common cases: predefined values @@ -82,16 +82,17 @@ Scoring Function Comment Usage examples: - >>> from sklearn import svm, cross_validation, datasets + >>> from sklearn import svm, datasets + >>> from sklearn.model_selection import cross_val_score >>> iris = datasets.load_iris() >>> X, y = iris.data, iris.target + >>> clf = svm.SVC(probability=True, random_state=0) + >>> cross_val_score(clf, X, y, scoring='log_loss') # doctest: +ELLIPSIS + array([-0.07..., -0.16..., -0.06...]) >>> model = svm.SVC() - >>> cross_validation.cross_val_score(model, X, y, scoring='wrong_choice') + >>> cross_val_score(model, X, y, scoring='wrong_choice') Traceback (most recent call last): ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'] - >>> clf = svm.SVC(probability=True, random_state=0) - >>> cross_validation.cross_val_score(clf, X, y, scoring='log_loss') # doctest: +ELLIPSIS - array([-0.07..., -0.16..., -0.06...]) .. note:: @@ -135,7 +136,7 @@ the :func:`fbeta_score` function:: >>> from sklearn.metrics import fbeta_score, make_scorer >>> ftwo_scorer = make_scorer(fbeta_score, beta=2) - >>> from sklearn.grid_search import GridSearchCV + >>> from sklearn.model_selection import GridSearchCV >>> from sklearn.svm import LinearSVC >>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=ftwo_scorer) @@ -1423,7 +1424,7 @@ To illustrate :class:`DummyClassifier`, first let's create an imbalanced dataset:: >>> from sklearn.datasets import load_iris - >>> from sklearn.cross_validation import train_test_split + >>> from sklearn.model_selection import train_test_split >>> iris = load_iris() >>> X, y = iris.data, iris.target >>> y[y != 1] = -1 diff --git a/doc/modules/pipeline.rst b/doc/modules/pipeline.rst index 0f9c1f635995d..819a3162340a7 100644 --- a/doc/modules/pipeline.rst +++ b/doc/modules/pipeline.rst @@ -82,7 +82,7 @@ Parameters of the estimators in the pipeline can be accessed using the This is particularly important for doing grid searches:: - >>> from sklearn.grid_search import GridSearchCV + >>> from sklearn.model_selection import GridSearchCV >>> params = dict(reduce_dim__n_components=[2, 5, 10], ... svm__C=[0.1, 10, 100]) >>> grid_search = GridSearchCV(clf, param_grid=params) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index fda63a3bb66e0..fb3bcc46466bd 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -535,8 +535,8 @@ correctly. 
``gamma`` defines how much influence a single training example has.
 The larger ``gamma`` is, the closer other examples must be to be affected.

 Proper choice of ``C`` and ``gamma`` is critical to the SVM's performance. One
-is advised to use :class:`sklearn.grid_search.GridSearchCV` with ``C`` and ``gamma`` spaced
-exponentially far apart to choose good values.
+is advised to use :class:`sklearn.model_selection.GridSearchCV` with
+``C`` and ``gamma`` spaced exponentially far apart to choose good values.

 .. topic:: Examples:

diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst
index a77ca00459eb8..67215503cb38e 100644
--- a/doc/tutorial/statistical_inference/model_selection.rst
+++ b/doc/tutorial/statistical_inference/model_selection.rst
@@ -41,45 +41,64 @@ data in *folds* that we use for training and testing::
     >>> print(scores)
     [0.93489148580968284, 0.95659432387312182, 0.93989983305509184]

-.. currentmodule:: sklearn.cross_validation
+.. currentmodule:: sklearn.model_selection

-This is called a :class:`KFold` cross validation
+This is called a :class:`KFold` cross-validation.

 .. _cv_generators_tut:

 Cross-validation generators
 =============================

+Scikit-learn has a collection of classes which can be used to generate lists of
+train/test indices for popular cross-validation strategies.
+They expose a ``split`` method which accepts the input
+dataset to be split and yields the train/test set indices for each iteration
+of the chosen cross-validation strategy.
-The code above to split data in train and test sets is tedious to write.
-Scikit-learn exposes cross-validation generators to generate list
-of indices for this purpose::
+The following example shows the usage of the ``split`` method.

-    >>> from sklearn import cross_validation
-    >>> k_fold = cross_validation.KFold(n=6, n_folds=3)
-    >>> for train_indices, test_indices in k_fold:
+    >>> from sklearn.model_selection import KFold, cross_val_score
+    >>> X = ["a", "a", "b", "c", "c", "c"]
+    >>> k_fold = KFold(n_folds=3)
+    >>> for train_indices, test_indices in k_fold.split(X):
     ...      print('Train: %s | test: %s' % (train_indices, test_indices))
     Train: [2 3 4 5] | test: [0 1]
     Train: [0 1 4 5] | test: [2 3]
     Train: [0 1 2 3] | test: [4 5]

-The cross-validation can then be implemented easily::
+The cross-validation can then be performed easily::

-    >>> kfold = cross_validation.KFold(len(X_digits), n_folds=3)
+    >>> kfold = KFold(n_folds=3)
     >>> [svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test])
-    ...  for train, test in kfold]
+    ...  for train, test in kfold.split(X_digits)]
     [0.93489148580968284, 0.95659432387312182, 0.93989983305509184]

-To compute the ``score`` method of an estimator, the sklearn exposes
-a helper function::
+The cross-validation score can be directly calculated using the
+:func:`cross_val_score` helper. Given an estimator, the cross-validation object
+and the input dataset, the :func:`cross_val_score` splits the data repeatedly into
+a training and a testing set, trains the estimator using the training set and
+computes the scores based on the testing set for each iteration of cross-validation.

-    >>> cross_validation.cross_val_score(svc, X_digits, y_digits, cv=kfold, n_jobs=-1)
+By default the estimator's ``score`` method is used to compute the individual scores.
+
+Refer to the :ref:`metrics module <metrics>` to learn more on the available scoring
+methods.
+
+    >>> cross_val_score(svc, X_digits, y_digits, cv=k_fold, n_jobs=-1)
     array([ 0.93489149,  0.95659432,  0.93989983])

 `n_jobs=-1` means that the computation will be dispatched on all the CPUs
 of the computer.

+Alternatively, the ``scoring`` argument can be provided to specify an alternative
+scoring method.
+
+    >>> cross_val_score(svc, X_digits, y_digits, cv=k_fold,
+    ...                 scoring='precision_macro')
+    array([ 0.93969761,  0.95911415,  0.94041254])
+

 **Cross-validation generators**

@@ -87,23 +106,77 @@ of the computer.

   *

-    - :class:`KFold` **(n, k)**
+    - :class:`KFold` **(n_folds, shuffle, random_state)**
+
+    - :class:`StratifiedKFold` **(n_folds, shuffle, random_state)**
+
+    - :class:`LabelKFold` **(n_folds, shuffle, random_state)**
+
+
+  *
+
+    - Splits the dataset into K folds, trains on K-1 and then tests on the left-out.
+
+    - Same as K-Fold but preserves the class distribution within each fold.
+
+    - Ensures that the same label is not in both testing and training sets.
+
+
+.. list-table::
+
+  *
+
+    - :class:`ShuffleSplit` **(n_iter, test_size, train_size, random_state)**
+
+    - :class:`StratifiedShuffleSplit`
+
+    - :class:`LabelShuffleSplit`
+
+  *
+
+    - Generates train/test indices based on random permutation.
+
+    - Same as shuffle split but preserves the class distribution within each iteration.
+
+    - Ensures that the same label is not in both testing and training sets.
+
+
+.. list-table::
+
+  *
+
+    - :class:`LeaveOneLabelOut` **()**
+
+    - :class:`LeavePLabelOut` **(p)**
+
+    - :class:`LeaveOneOut` **()**
+
+
+
+  *
+
+    - Takes a label array to group observations.
+
+    - Leave P labels out.

-    - :class:`StratifiedKFold` **(y, k)**
+    - Leave one observation out.

-    - :class:`LeaveOneOut` **(n)**

-    - :class:`LeaveOneLabelOut` **(labels)**
+
+.. list-table::

   *

-      - Split it K folds, train on K-1 and then test on left-out
+    - :class:`LeavePOut` **(p)**
+
+    - :class:`PredefinedSplit`
+
+  *

-      - It preserves the class ratios / label distribution within each fold.
+    - Leave P observations out.

-      - Leave one observation out
+    - Generates train/test indices based on predefined splits.

-      - Takes a label array to group observations

 .. currentmodule:: sklearn.svm

@@ -132,14 +205,14 @@ Grid-search and cross-validated estimators
 Grid-search
 -------------

-.. currentmodule:: sklearn.grid_search
+.. currentmodule:: sklearn.model_selection

 The sklearn provides an object that, given data, computes the score during
 the fit of an estimator on a parameter grid and chooses the parameters to
 maximize the cross-validation score. This object takes an estimator during
 the construction and exposes an estimator API::

-    >>> from sklearn.grid_search import GridSearchCV
+    >>> from sklearn.model_selection import GridSearchCV, cross_val_score
     >>> Cs = np.logspace(-6, -1, 10)
     >>> clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs),
     ...                    n_jobs=-1)
@@ -163,8 +236,8 @@ a stratified 3-fold.

 ::

-    >>> cross_validation.cross_val_score(clf, X_digits, y_digits)
-    ... # doctest: +ELLIPSIS
+    >>> cross_val_score(clf, X_digits, y_digits)
+    ...
# doctest: +ELLIPSIS array([ 0.938..., 0.963..., 0.944...]) Two cross-validation loops are performed in parallel: one by the diff --git a/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py b/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py index fe2cde6e3f8fb..682fb45bd2aaa 100644 --- a/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py +++ b/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py @@ -15,7 +15,7 @@ from sklearn.linear_model import Perceptron from sklearn.pipeline import Pipeline from sklearn.datasets import load_files -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn import metrics diff --git a/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py b/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py index bd2b44a506226..11b1ff07acf7e 100644 --- a/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py +++ b/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py @@ -15,9 +15,9 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import LinearSVC from sklearn.pipeline import Pipeline -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from sklearn.datasets import load_files -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn import metrics diff --git a/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py b/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py index 2de33948bec58..c86b51eaeef9e 100644 --- a/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py +++ b/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py @@ -15,7 +15,7 @@ from sklearn.linear_model import Perceptron from sklearn.pipeline import Pipeline from sklearn.datasets import load_files -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn import metrics diff --git a/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py b/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py index c4dcba5da9f90..85c4989786934 100644 --- a/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py +++ b/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py @@ -15,9 +15,9 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import LinearSVC from sklearn.pipeline import Pipeline -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from sklearn.datasets import load_files -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn import metrics diff --git a/doc/tutorial/text_analytics/working_with_text_data.rst b/doc/tutorial/text_analytics/working_with_text_data.rst index f842185ddc75a..184b02a09dd11 100644 --- a/doc/tutorial/text_analytics/working_with_text_data.rst +++ b/doc/tutorial/text_analytics/working_with_text_data.rst @@ -420,7 +420,7 @@ parameters on a grid of possible values. 
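A hypothetical miniature of that kind of pipeline search under the new import path (the four documents and the ``vect``/``tfidf``/``clf`` step names are stand-ins mirroring the tutorial)::

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    docs = ['good film', 'bad film', 'great plot', 'poor plot']
    y = [1, 0, 1, 0]
    pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier())])
    # <step name>__<parameter name> addresses a parameter inside the pipeline.
    grid = GridSearchCV(pipe, {'vect__ngram_range': [(1, 1), (1, 2)],
                               'tfidf__use_idf': (True, False)}, cv=2)
    grid.fit(docs, y)
    print(grid.best_params_)
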
We try out all classifiers on either words or bigrams, with or without idf, and with a penalty parameter of either 0.01 or 0.001 for the linear SVM:: - >>> from sklearn.grid_search import GridSearchCV + >>> from sklearn.model_selection import GridSearchCV >>> parameters = {'vect__ngram_range': [(1, 1), (1, 2)], ... 'tfidf__use_idf': (True, False), ... 'clf__alpha': (1e-2, 1e-3), diff --git a/examples/applications/face_recognition.py b/examples/applications/face_recognition.py index be466e9532cdf..b79599ecb3a06 100644 --- a/examples/applications/face_recognition.py +++ b/examples/applications/face_recognition.py @@ -31,9 +31,9 @@ import logging import matplotlib.pyplot as plt -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split +from sklearn.model_selection import GridSearchCV from sklearn.datasets import fetch_lfw_people -from sklearn.grid_search import GridSearchCV from sklearn.metrics import classification_report from sklearn.metrics import confusion_matrix from sklearn.decomposition import RandomizedPCA diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py index 2267f02dd0022..299f924e2a468 100644 --- a/examples/calibration/plot_calibration.py +++ b/examples/calibration/plot_calibration.py @@ -36,7 +36,7 @@ from sklearn.naive_bayes import GaussianNB from sklearn.metrics import brier_score_loss from sklearn.calibration import CalibratedClassifierCV -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split n_samples = 50000 diff --git a/examples/calibration/plot_calibration_curve.py b/examples/calibration/plot_calibration_curve.py index 42dc8473e6c30..a37e3158c0c12 100644 --- a/examples/calibration/plot_calibration_curve.py +++ b/examples/calibration/plot_calibration_curve.py @@ -56,7 +56,7 @@ from sklearn.metrics import (brier_score_loss, precision_score, recall_score, f1_score) from sklearn.calibration import CalibratedClassifierCV, calibration_curve -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split # Create dataset of classification task with many redundant and few diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py index 935d388c047e1..38a5bea6e1dc2 100644 --- a/examples/classification/plot_classifier_comparison.py +++ b/examples/classification/plot_classifier_comparison.py @@ -31,7 +31,7 @@ import numpy as np import matplotlib.pyplot as plt from matplotlib.colors import ListedColormap -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.datasets import make_moons, make_circles, make_classification from sklearn.neural_network import MLPClassifier diff --git a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py index 488db4d64855d..a0152c85baf60 100644 --- a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py +++ b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py @@ -30,9 +30,9 @@ from sklearn.cluster import FeatureAgglomeration from sklearn.linear_model import BayesianRidge from sklearn.pipeline import Pipeline -from sklearn.grid_search import GridSearchCV from sklearn.externals.joblib import Memory -from sklearn.cross_validation import KFold +from 
sklearn.model_selection import GridSearchCV +from sklearn.model_selection import KFold ############################################################################### # Generate data @@ -60,7 +60,7 @@ ############################################################################### # Compute the coefs of a Bayesian Ridge with GridSearch -cv = KFold(len(y), 2) # cross-validation generator for model selection +cv = KFold(2) # cross-validation generator for model selection ridge = BayesianRidge() cachedir = tempfile.mkdtemp() mem = Memory(cachedir=cachedir, verbose=1) diff --git a/examples/covariance/plot_covariance_estimation.py b/examples/covariance/plot_covariance_estimation.py index 85e26705b03e9..96f637974ee29 100644 --- a/examples/covariance/plot_covariance_estimation.py +++ b/examples/covariance/plot_covariance_estimation.py @@ -49,7 +49,7 @@ from sklearn.covariance import LedoitWolf, OAS, ShrunkCovariance, \ log_likelihood, empirical_covariance -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV ############################################################################### diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py index 067bcd7b6b479..89bb707a37699 100644 --- a/examples/decomposition/plot_pca_vs_fa_model_selection.py +++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py @@ -35,8 +35,8 @@ from sklearn.decomposition import PCA, FactorAnalysis from sklearn.covariance import ShrunkCovariance, LedoitWolf -from sklearn.cross_validation import cross_val_score -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import cross_val_score +from sklearn.model_selection import GridSearchCV ############################################################################### # Create the data diff --git a/examples/ensemble/plot_feature_transformation.py b/examples/ensemble/plot_feature_transformation.py index 1d5b927e48988..e004c167e67af 100644 --- a/examples/ensemble/plot_feature_transformation.py +++ b/examples/ensemble/plot_feature_transformation.py @@ -35,7 +35,7 @@ from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier, GradientBoostingClassifier) from sklearn.preprocessing import OneHotEncoder -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.metrics import roc_curve from sklearn.pipeline import make_pipeline @@ -118,4 +118,4 @@ plt.ylabel('True positive rate') plt.title('ROC curve (zoomed in at top left)') plt.legend(loc='best') -plt.show() \ No newline at end of file +plt.show() diff --git a/examples/ensemble/plot_gradient_boosting_oob.py b/examples/ensemble/plot_gradient_boosting_oob.py index a39f709d36979..39e623f261cca 100644 --- a/examples/ensemble/plot_gradient_boosting_oob.py +++ b/examples/ensemble/plot_gradient_boosting_oob.py @@ -33,8 +33,8 @@ import matplotlib.pyplot as plt from sklearn import ensemble -from sklearn.cross_validation import KFold -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import KFold +from sklearn.model_selection import train_test_split # Generate data (adapted from G. 
Ridgeway's gbm example) @@ -75,10 +75,10 @@ def heldout_score(clf, X_test, y_test): def cv_estimate(n_folds=3): - cv = KFold(n=X_train.shape[0], n_folds=n_folds) + cv = KFold(n_folds=n_folds) cv_clf = ensemble.GradientBoostingClassifier(**params) val_scores = np.zeros((n_estimators,), dtype=np.float64) - for train, test in cv: + for train, test in cv.split(X_train, y_train): cv_clf.fit(X_train[train], y_train[train]) val_scores += heldout_score(cv_clf, X_train[test], y_train[test]) val_scores /= n_folds diff --git a/examples/ensemble/plot_partial_dependence.py b/examples/ensemble/plot_partial_dependence.py index b480e228d3ca3..d4a26166944c3 100644 --- a/examples/ensemble/plot_partial_dependence.py +++ b/examples/ensemble/plot_partial_dependence.py @@ -51,7 +51,7 @@ from mpl_toolkits.mplot3d import Axes3D -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble.partial_dependence import plot_partial_dependence from sklearn.ensemble.partial_dependence import partial_dependence diff --git a/examples/exercises/plot_cv_diabetes.py b/examples/exercises/plot_cv_diabetes.py index 424c037a5547e..5524d2e3c1334 100644 --- a/examples/exercises/plot_cv_diabetes.py +++ b/examples/exercises/plot_cv_diabetes.py @@ -14,13 +14,17 @@ import numpy as np import matplotlib.pyplot as plt -from sklearn import cross_validation, datasets, linear_model +from sklearn import datasets +from sklearn.linear_model import LassoCV +from sklearn.linear_model import Lasso +from sklearn.model_selection import KFold +from sklearn.model_selection import cross_val_score diabetes = datasets.load_diabetes() X = diabetes.data[:150] y = diabetes.target[:150] -lasso = linear_model.Lasso() +lasso = Lasso() alphas = np.logspace(-4, -.5, 30) scores = list() @@ -28,7 +32,7 @@ for alpha in alphas: lasso.alpha = alpha - this_scores = cross_validation.cross_val_score(lasso, X, y, n_jobs=1) + this_scores = cross_val_score(lasso, X, y, n_jobs=1) scores.append(np.mean(this_scores)) scores_std.append(np.std(this_scores)) @@ -51,15 +55,15 @@ # performs cross-validation on the training data it receives). # We use external cross-validation to see how much the automatically obtained # alphas differ across different cross-validation folds. -lasso_cv = linear_model.LassoCV(alphas=alphas) -k_fold = cross_validation.KFold(len(X), 3) +lasso_cv = LassoCV(alphas=alphas) +k_fold = KFold(3) print("Answer to the bonus question:", "how much can you trust the selection of alpha?") print() print("Alpha parameters maximising the generalization score on different") print("subsets of the data:") -for k, (train, test) in enumerate(k_fold): +for k, (train, test) in enumerate(k_fold.split(X, y)): lasso_cv.fit(X[train], y[train]) print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}". 
format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test]))) diff --git a/examples/exercises/plot_cv_digits.py b/examples/exercises/plot_cv_digits.py index 92f04a935cc5a..a68f92afbdad9 100644 --- a/examples/exercises/plot_cv_digits.py +++ b/examples/exercises/plot_cv_digits.py @@ -12,7 +12,8 @@ import numpy as np -from sklearn import cross_validation, datasets, svm +from sklearn.model_selection import cross_val_score +from sklearn import datasets, svm digits = datasets.load_digits() X = digits.data @@ -25,7 +26,7 @@ scores_std = list() for C in C_s: svc.C = C - this_scores = cross_validation.cross_val_score(svc, X, y, n_jobs=1) + this_scores = cross_val_score(svc, X, y, n_jobs=1) scores.append(np.mean(this_scores)) scores_std.append(np.std(this_scores)) diff --git a/examples/feature_selection/plot_permutation_test_for_classification.py b/examples/feature_selection/plot_permutation_test_for_classification.py index 4df102578c9da..24b999451a067 100644 --- a/examples/feature_selection/plot_permutation_test_for_classification.py +++ b/examples/feature_selection/plot_permutation_test_for_classification.py @@ -20,7 +20,8 @@ import matplotlib.pyplot as plt from sklearn.svm import SVC -from sklearn.cross_validation import StratifiedKFold, permutation_test_score +from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import permutation_test_score from sklearn import datasets @@ -39,7 +40,7 @@ X = np.c_[X, E] svm = SVC(kernel='linear') -cv = StratifiedKFold(y, 2) +cv = StratifiedKFold(2) score, permutation_scores, pvalue = permutation_test_score( svm, X, y, scoring="accuracy", cv=cv, n_permutations=100, n_jobs=1) diff --git a/examples/feature_selection/plot_rfe_with_cross_validation.py b/examples/feature_selection/plot_rfe_with_cross_validation.py index 232aa115c2d77..8b22ab0d54108 100644 --- a/examples/feature_selection/plot_rfe_with_cross_validation.py +++ b/examples/feature_selection/plot_rfe_with_cross_validation.py @@ -10,7 +10,7 @@ import matplotlib.pyplot as plt from sklearn.svm import SVC -from sklearn.cross_validation import StratifiedKFold +from sklearn.model_selection import StratifiedKFold from sklearn.feature_selection import RFECV from sklearn.datasets import make_classification @@ -23,7 +23,7 @@ svc = SVC(kernel="linear") # The "accuracy" scoring is proportional to the number of correct # classifications -rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 2), +rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), scoring='accuracy') rfecv.fit(X, y) diff --git a/examples/feature_stacker.py b/examples/feature_stacker.py index d1f9453e28e58..4ce574aa36bca 100644 --- a/examples/feature_stacker.py +++ b/examples/feature_stacker.py @@ -20,7 +20,7 @@ # License: BSD 3 clause from sklearn.pipeline import Pipeline, FeatureUnion -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from sklearn.svm import SVC from sklearn.datasets import load_iris from sklearn.decomposition import PCA diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 721598d765589..bcfa3c68c69f3 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -57,7 +57,7 @@ import matplotlib.pyplot as plt from sklearn.kernel_ridge import KernelRidge -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from sklearn.gaussian_process import GaussianProcessRegressor from 
sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared diff --git a/examples/linear_model/plot_sgd_comparison.py b/examples/linear_model/plot_sgd_comparison.py index 98bc39cff755e..7506718f93f90 100644 --- a/examples/linear_model/plot_sgd_comparison.py +++ b/examples/linear_model/plot_sgd_comparison.py @@ -14,7 +14,7 @@ import matplotlib.pyplot as plt from sklearn import datasets -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.linear_model import SGDClassifier, Perceptron from sklearn.linear_model import PassiveAggressiveClassifier from sklearn.linear_model import LogisticRegression diff --git a/examples/missing_values.py b/examples/missing_values.py index 59444b36490e3..8a0895f9a589f 100644 --- a/examples/missing_values.py +++ b/examples/missing_values.py @@ -28,7 +28,7 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import Pipeline from sklearn.preprocessing import Imputer -from sklearn.cross_validation import cross_val_score +from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) diff --git a/examples/mixture/plot_gmm_classifier.py b/examples/mixture/plot_gmm_classifier.py index a3179c5aee054..5fd5be80c77cb 100644 --- a/examples/mixture/plot_gmm_classifier.py +++ b/examples/mixture/plot_gmm_classifier.py @@ -33,7 +33,7 @@ import numpy as np from sklearn import datasets -from sklearn.cross_validation import StratifiedKFold +from sklearn.model_selection import StratifiedKFold from sklearn.externals.six.moves import xrange from sklearn.mixture import GMM @@ -56,9 +56,9 @@ def make_ellipses(gmm, ax): # Break up the dataset into non-overlapping training (75%) and testing # (25%) sets. -skf = StratifiedKFold(iris.target, n_folds=4) +skf = StratifiedKFold(n_folds=4) # Only take the first fold. -train_index, test_index = next(iter(skf)) +train_index, test_index = next(iter(skf.split(iris.data, iris.target))) X_train = iris.data[train_index] diff --git a/examples/model_selection/README.txt b/examples/model_selection/README.txt index 553c6e7d6498e..b35a778b28a7f 100644 --- a/examples/model_selection/README.txt +++ b/examples/model_selection/README.txt @@ -3,5 +3,4 @@ Model Selection ----------------------- -Examples concerning model selection, mostly contained in the -:mod:`sklearn.grid_search` and :mod:`sklearn.cross_validation` modules. +Examples related to the :mod:`sklearn.model_selection` module. diff --git a/examples/model_selection/grid_search_digits.py b/examples/model_selection/grid_search_digits.py index c8aec1bab8c0f..6b039629df2a5 100644 --- a/examples/model_selection/grid_search_digits.py +++ b/examples/model_selection/grid_search_digits.py @@ -4,7 +4,7 @@ ============================================================ This examples shows how a classifier is optimized by cross-validation, -which is done using the :class:`sklearn.grid_search.GridSearchCV` object +which is done using the :class:`sklearn.model_selection.GridSearchCV` object on a development set that comprises only half of the available labeled data. 
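The development/evaluation protocol described here can be sketched with the relocated helpers as follows (the half-and-half split and the tiny grid are placeholders)::

    from sklearn import datasets, svm
    from sklearn.metrics import classification_report
    from sklearn.model_selection import GridSearchCV, train_test_split

    digits = datasets.load_digits()
    # The search only ever sees the development half; the other half is
    # reserved for the final evaluation report.
    X_dev, X_eval, y_dev, y_eval = train_test_split(
        digits.data, digits.target, test_size=0.5, random_state=0)
    search = GridSearchCV(svm.SVC(), {'C': [1, 10], 'gamma': [1e-3, 1e-4]})
    search.fit(X_dev, y_dev)
    print(classification_report(y_eval, search.predict(X_eval)))
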
The performance of the selected hyper-parameters and trained model is
@@ -19,8 +19,8 @@ from __future__ import print_function

 from sklearn import datasets
-from sklearn.cross_validation import train_test_split
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import GridSearchCV
 from sklearn.metrics import classification_report
 from sklearn.svm import SVC
diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index 11b690d91e2ef..daf82718d42e1 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -1,3 +1,4 @@
+
 """
 ==========================================================
 Sample pipeline for text feature extraction and evaluation
@@ -56,7 +57,7 @@ from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_extraction.text import TfidfTransformer
 from sklearn.linear_model import SGDClassifier
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline

 print(__doc__)
diff --git a/examples/model_selection/plot_confusion_matrix.py b/examples/model_selection/plot_confusion_matrix.py
index 771d058e2a4a5..250d71c08c442 100644
--- a/examples/model_selection/plot_confusion_matrix.py
+++ b/examples/model_selection/plot_confusion_matrix.py
@@ -30,7 +30,7 @@ import matplotlib.pyplot as plt

 from sklearn import svm, datasets
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import confusion_matrix

 # import some data to play with
diff --git a/examples/model_selection/plot_learning_curve.py b/examples/model_selection/plot_learning_curve.py
index 7a47fd574635d..3adcd78aa8b6a 100644
--- a/examples/model_selection/plot_learning_curve.py
+++ b/examples/model_selection/plot_learning_curve.py
@@ -17,11 +17,11 @@ import numpy as np
 import matplotlib.pyplot as plt

-from sklearn import cross_validation
 from sklearn.naive_bayes import GaussianNB
 from sklearn.svm import SVC
 from sklearn.datasets import load_digits
-from sklearn.learning_curve import learning_curve
+from sklearn.model_selection import learning_curve
+from sklearn.model_selection import ShuffleSplit


 def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
@@ -48,10 +48,20 @@ def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
     ylim : tuple, shape (ymin, ymax), optional
         Defines minimum and maximum yvalues plotted.

-    cv : integer, cross-validation generator, optional
-        If an integer is passed, it is the number of folds (defaults to 3).
-        Specific cross-validation objects can be passed, see
-        sklearn.cross_validation module for the list of possible objects
+    cv : int, cross-validation generator or an iterable, optional
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+          - None, to use the default 3-fold cross-validation,
+          - integer, to specify the number of folds.
+          - An object to be used as a cross-validation generator.
+          - An iterable yielding train/test splits.
+
+        For integer/None inputs, if ``y`` is binary or multiclass,
+        :class:`StratifiedKFold` is used. If the estimator is not a classifier
+        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
+
+        Refer to the :ref:`User Guide <cross_validation>` for the various
+        cross-validators that can be used here.
n_jobs : integer, optional Number of jobs to run in parallel (default 1). @@ -91,16 +101,14 @@ def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, title = "Learning Curves (Naive Bayes)" # Cross validation with 100 iterations to get smoother mean test and train # score curves, each time with 20% data randomly selected as a validation set. -cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100, - test_size=0.2, random_state=0) +cv = ShuffleSplit(n_iter=100, test_size=0.2, random_state=0) estimator = GaussianNB() plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4) title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)" # SVC is more expensive so we do a lower number of CV iterations: -cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10, - test_size=0.2, random_state=0) +cv = ShuffleSplit(n_iter=10, test_size=0.2, random_state=0) estimator = SVC(gamma=0.001) plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4) diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py index 4d83910008f4a..f9244410d5792 100644 --- a/examples/model_selection/plot_precision_recall.py +++ b/examples/model_selection/plot_precision_recall.py @@ -80,7 +80,7 @@ from sklearn import svm, datasets from sklearn.metrics import precision_recall_curve from sklearn.metrics import average_precision_score -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 8448998e2ae7f..a3884eff33c62 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -43,7 +43,7 @@ from sklearn import svm, datasets from sklearn.metrics import roc_curve, auc -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier from scipy import interp diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index 6671e60166dd4..1508839386ffe 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -25,7 +25,7 @@ .. 
diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index 6671e60166dd4..1508839386ffe 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -25,7 +25,7 @@
 .. note:: See also :func:`sklearn.metrics.auc_score`,
-          :func:`sklearn.cross_validation.cross_val_score`,
+          :func:`sklearn.model_selection.cross_val_score`,
           :ref:`example_model_selection_plot_roc.py`,
 
 """
@@ -38,7 +38,7 @@
 from sklearn import svm, datasets
 from sklearn.metrics import roc_curve, auc
-from sklearn.cross_validation import StratifiedKFold
+from sklearn.model_selection import StratifiedKFold
 
 ###############################################################################
 # Data IO and generation
@@ -58,7 +58,7 @@
 # Classification and ROC analysis
 
 # Run classifier with cross-validation and plot ROC curves
-cv = StratifiedKFold(y, n_folds=6)
+cv = StratifiedKFold(n_folds=6)
 classifier = svm.SVC(kernel='linear', probability=True,
                      random_state=random_state)
@@ -70,7 +70,7 @@
 lw = 2
 i = 0
-for (train, test), color in zip(cv, colors):
+for (train, test), color in zip(cv.split(X, y), colors):
     probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
     # Compute ROC curve and area the curve
     fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
@@ -84,7 +84,7 @@
 plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
          label='Luck')
-mean_tpr /= len(cv)
+mean_tpr /= cv.get_n_splits(X, y)
 mean_tpr[-1] = 1.0
 mean_auc = auc(mean_fpr, mean_tpr)
 plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
diff --git a/examples/model_selection/plot_underfitting_overfitting.py b/examples/model_selection/plot_underfitting_overfitting.py
index f8958cbffe21b..ff454664c7b18 100644
--- a/examples/model_selection/plot_underfitting_overfitting.py
+++ b/examples/model_selection/plot_underfitting_overfitting.py
@@ -27,7 +27,7 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.linear_model import LinearRegression
-from sklearn import cross_validation
+from sklearn.model_selection import cross_val_score
 
 np.random.seed(0)
@@ -51,8 +51,8 @@
     pipeline.fit(X[:, np.newaxis], y)
 
     # Evaluate the models using crossvalidation
-    scores = cross_validation.cross_val_score(pipeline,
-        X[:, np.newaxis], y, scoring="mean_squared_error", cv=10)
+    scores = cross_val_score(pipeline, X[:, np.newaxis], y,
+                             scoring="mean_squared_error", cv=10)
 
     X_test = np.linspace(0, 1, 100)
     plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py
index e3232984aa33a..ed74a41ff100b 100644
--- a/examples/model_selection/plot_validation_curve.py
+++ b/examples/model_selection/plot_validation_curve.py
@@ -18,7 +18,7 @@
 from sklearn.datasets import load_digits
 from sklearn.svm import SVC
-from sklearn.learning_curve import validation_curve
+from sklearn.model_selection import validation_curve
 
 digits = load_digits()
 X, y = digits.data, digits.target
diff --git a/examples/model_selection/randomized_search.py b/examples/model_selection/randomized_search.py
index 0682b24689813..85a16c6f52d55 100644
--- a/examples/model_selection/randomized_search.py
+++ b/examples/model_selection/randomized_search.py
@@ -26,7 +26,8 @@
 from operator import itemgetter
 from scipy.stats import randint as sp_randint
 
-from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
+from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import RandomizedSearchCV
 from sklearn.datasets import load_digits
 from sklearn.ensemble import RandomForestClassifier
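The plot_roc_crossval.py hunks above illustrate the new splitter protocol:
the data is handed to ``split`` rather than to the constructor, and
``get_n_splits`` replaces ``len(cv)``. A minimal sketch, assuming the
``n_folds`` parameter name used in this changeset::

    from sklearn import datasets
    from sklearn.model_selection import StratifiedKFold

    iris = datasets.load_iris()
    X, y = iris.data, iris.target

    cv = StratifiedKFold(n_folds=3)

    # The splitter yields train/test index arrays once given the data.
    for train, test in cv.split(X, y):
        print(train.shape, test.shape)

    # Explicit query instead of len(cv):
    n_splits = cv.get_n_splits(X, y)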
diff --git a/examples/neighbors/plot_digits_kde_sampling.py b/examples/neighbors/plot_digits_kde_sampling.py
index 4680a41780aed..ba59fb5ece537 100644
--- a/examples/neighbors/plot_digits_kde_sampling.py
+++ b/examples/neighbors/plot_digits_kde_sampling.py
@@ -16,7 +16,7 @@
 from sklearn.datasets import load_digits
 from sklearn.neighbors import KernelDensity
 from sklearn.decomposition import PCA
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import GridSearchCV
 
 # load the data
 digits = load_digits()
diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py
index 9f085036d2013..2b9b15fe3d966 100644
--- a/examples/neural_networks/plot_rbm_logistic_classification.py
+++ b/examples/neural_networks/plot_rbm_logistic_classification.py
@@ -37,7 +37,7 @@
 from scipy.ndimage import convolve
 from sklearn import linear_model, datasets, metrics
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.neural_network import BernoulliRBM
 from sklearn.pipeline import Pipeline
diff --git a/examples/plot_cv_predict.py b/examples/plot_cv_predict.py
index 5a9e541949dc9..4657ff816369a 100644
--- a/examples/plot_cv_predict.py
+++ b/examples/plot_cv_predict.py
@@ -8,7 +8,7 @@
 """
 from sklearn import datasets
-from sklearn.cross_validation import cross_val_predict
+from sklearn.model_selection import cross_val_predict
 from sklearn import linear_model
 import matplotlib.pyplot as plt
diff --git a/examples/plot_digits_pipe.py b/examples/plot_digits_pipe.py
index 139ade15ba7c2..cd134fe20ddec 100644
--- a/examples/plot_digits_pipe.py
+++ b/examples/plot_digits_pipe.py
@@ -25,7 +25,7 @@
 from sklearn import linear_model, decomposition, datasets
 from sklearn.pipeline import Pipeline
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import GridSearchCV
 
 logistic = linear_model.LogisticRegression()
diff --git a/examples/plot_kernel_ridge_regression.py b/examples/plot_kernel_ridge_regression.py
index 19aeece6658cc..5ca4093a05b06 100644
--- a/examples/plot_kernel_ridge_regression.py
+++ b/examples/plot_kernel_ridge_regression.py
@@ -41,8 +41,8 @@
 import numpy as np
 
 from sklearn.svm import SVR
-from sklearn.grid_search import GridSearchCV
-from sklearn.learning_curve import learning_curve
+from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import learning_curve
 from sklearn.kernel_ridge import KernelRidge
 import matplotlib.pyplot as plt
diff --git a/examples/preprocessing/plot_function_transformer.py b/examples/preprocessing/plot_function_transformer.py
index 031a126a92e6d..bfe5e41932201 100644
--- a/examples/preprocessing/plot_function_transformer.py
+++ b/examples/preprocessing/plot_function_transformer.py
@@ -11,7 +11,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.decomposition import PCA
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import FunctionTransformer
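For ``GridSearchCV`` the hunks above are likewise a pure import move;
construction, fitting and the fitted attributes are untouched. A sketch of
the unchanged call pattern (not taken from the examples themselves)::

    import numpy as np
    from sklearn import datasets, svm
    from sklearn.model_selection import GridSearchCV

    iris = datasets.load_iris()
    param_grid = {'C': [0.1, 1, 10], 'gamma': np.logspace(-3, 0, 4)}

    # Identical to the grid_search.GridSearchCV call it replaces.
    grid = GridSearchCV(svm.SVC(), param_grid=param_grid)
    grid.fit(iris.data, iris.target)
    print(grid.best_params_)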
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index d3ea13e2e88e7..27bbd94ee00c9 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -74,8 +74,8 @@
 from sklearn.svm import SVC
 from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import load_iris
-from sklearn.cross_validation import StratifiedShuffleSplit
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import StratifiedShuffleSplit
+from sklearn.model_selection import GridSearchCV
 
 
 # Utility function to move the midpoint of a colormap to be around
@@ -128,7 +128,7 @@ def __call__(self, value, clip=None):
 C_range = np.logspace(-2, 10, 13)
 gamma_range = np.logspace(-9, 3, 13)
 param_grid = dict(gamma=gamma_range, C=C_range)
-cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
+cv = StratifiedShuffleSplit(n_iter=5, test_size=0.2, random_state=42)
 grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
 grid.fit(X, y)
diff --git a/examples/svm/plot_svm_anova.py b/examples/svm/plot_svm_anova.py
index 9ce225bf980e5..45da4c35e0a64 100644
--- a/examples/svm/plot_svm_anova.py
+++ b/examples/svm/plot_svm_anova.py
@@ -10,7 +10,8 @@
 import numpy as np
 import matplotlib.pyplot as plt
-from sklearn import svm, datasets, feature_selection, cross_validation
+from sklearn import svm, datasets, feature_selection
+from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
 
 ###############################################################################
@@ -42,7 +43,7 @@
 for percentile in percentiles:
     clf.set_params(anova__percentile=percentile)
     # Compute cross-validation score using all CPUs
-    this_scores = cross_validation.cross_val_score(clf, X, y, n_jobs=1)
+    this_scores = cross_val_score(clf, X, y, n_jobs=1)
     score_means.append(this_scores.mean())
     score_stds.append(this_scores.std())
diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index 60173338bd0cd..ed92bc19dcada 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -88,8 +88,8 @@
 import matplotlib.pyplot as plt
 
 from sklearn.svm import LinearSVC
-from sklearn.cross_validation import ShuffleSplit
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import ShuffleSplit
+from sklearn.model_selection import GridSearchCV
 from sklearn.utils import check_random_state
 from sklearn import datasets
@@ -128,8 +128,8 @@
 # To get nice curve, we need a large number of iterations to
 # reduce the variance
 grid = GridSearchCV(clf, refit=False, param_grid=param_grid,
-                    cv=ShuffleSplit(n=n_samples, train_size=train_size,
-                                    n_iter=250, random_state=1))
+                    cv=ShuffleSplit(train_size=train_size, n_iter=250,
+                                    random_state=1))
 grid.fit(X, y)
 
 scores = [x[1] for x in grid.grid_scores_]
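As in plot_rbf_parameters.py and plot_svm_scale_c.py above, a data-independent
splitter can be handed straight to ``GridSearchCV``, which calls
``cv.split(X, y)`` internally. A minimal sketch, again assuming the ``n_iter``
parameter name used in this changeset::

    from sklearn import datasets, svm
    from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

    iris = datasets.load_iris()

    # No y at construction time; GridSearchCV supplies the data later.
    cv = StratifiedShuffleSplit(n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(svm.SVC(), param_grid={'C': [0.1, 1, 10]}, cv=cv)
    grid.fit(iris.data, iris.target)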