WIP support future extensibility of grid search results #2079

Closed
wants to merge 2 commits
335 changes: 276 additions & 59 deletions sklearn/cross_validation.py
@@ -15,18 +15,22 @@
from math import ceil, floor, factorial
import numbers
from abc import ABCMeta, abstractmethod
import time
import operator

import numpy as np
import scipy.sparse as sp

from .base import is_classifier, clone
from .utils import check_arrays, check_random_state, safe_mask
from .utils.validation import _num_samples
from .utils.fixes import unique
from .externals.joblib import Parallel, delayed
from .externals.joblib import Parallel, delayed, logger
from .externals.six import string_types, with_metaclass
from .metrics import SCORERS, Scorer

__all__ = ['Bootstrap',
'CVScorer',
'KFold',
'LeaveOneLabelOut',
'LeaveOneOut',
@@ -1024,13 +1028,74 @@ def __len__(self):

##############################################################################

def _cross_val_score(estimator, X, y, scorer, train, test, verbose,
fit_params):
"""Inner loop for cross validation"""
n_samples = X.shape[0] if sp.issparse(X) else len(X)
def _fit_fold(estimator, X, y, fold_index, train, test, scoring, verbose,
parameters=None, candidate_index=None, fit_params=None):
"""Run fit on one set of parameters.

Parameters
----------
estimator : estimator object
This estimator will be cloned and then fitted.

X : array-like, sparse matrix or list
Input data.

y : array-like or None
Targets for input data.

train : ndarray, dtype int or bool
Boolean mask or indices for training set.

test : ndarray, dtype int or bool
Boolean mask or indices for test set.

scoring : callable or None
If provided, must be a scoring object / function with signature
``scoring(estimator, X, y)``.

verbose : int
Verbosity level.

parameters : dict
Parameters to be set on estimator for this fold.

fit_params : dict or None
Additional parameters to pass to the fit method of the estimator.


Returns
-------
results : dict of string to any
An extensible store of fold results, including the following keys:

``'test_score'`` : float
The estimator's score on the test set.
``'test_n_samples'`` : int
The number of samples in the test set.
``'train_n_samples'`` : int
The number of samples in the training set.
``'fold_index'`` : int
The index of this cross-validation fold.
``'candidate_index'`` : int or None
The index of the candidate parameter setting, if any.
``'parameters'`` : dict or None
The parameter settings applied to the estimator, if any.
"""
if verbose > 1:
start_time = time.time()
if parameters is None:
msg = ''
else:
msg = '%s' % (', '.join('%s=%s' % (k, v)
for k, v in parameters.items()))
print("Fitting fold %s %s" % (msg, (64 - len(msg)) * '.'))

n_samples = _num_samples(X)

# Adapt fit_params to train portion only
if fit_params is None:
fit_params = {}
fit_params = dict([(k, np.asarray(v)[train]
if hasattr(v, '__len__') and len(v) == n_samples else v)
for k, v in fit_params.items()])
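# Illustrative note, not in the original patch: a fit parameter such as
# sample_weight, with one entry per sample, is reduced here to its
# training-set entries, while scalar parameters pass through unchanged.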

if hasattr(estimator, 'kernel') and callable(estimator.kernel):
# cannot compute the kernel values with custom function
raise ValueError("Cannot use a custom kernel function. "
"Precompute the kernel matrix instead.")

if not hasattr(X, "shape"):
if getattr(estimator, "_pairwise", False):
raise ValueError("Precomputed kernels or affinity matrices have "
@@ -1048,23 +1113,193 @@ def _cross_val_score(estimator, X, y, scorer, train, test, verbose,
X_train = X[safe_mask(X, train)]
X_test = X[safe_mask(X, test)]

if y is None:
y_train = None
y_test = None
else:
y_train = y[train]
y_test = y[test]
estimator.fit(X_train, y_train, **fit_params)
if scorer is None:
score = estimator.score(X_test, y_test)
# set the candidate parameters on a clone of the estimator
if parameters is not None:
estimator = clone(estimator)
estimator.set_params(**parameters)

if scoring is None:
scoring = lambda estimator, *args: estimator.score(*args)

if y is not None:
y_test = y[safe_mask(y, test)]
y_train = y[safe_mask(y, train)]
fit_args = (X_train, y_train)
score_args = (X_test, y_test)
else:
score = scorer(estimator, X_test, y_test)
if not isinstance(score, numbers.Number):
raise ValueError("scoring must return a number, got %s (%s)"
" instead." % (str(score), type(score)))
fit_args = (X_train,)
score_args = (X_test,)

# do actual fitting
estimator.fit(*fit_args, **fit_params)
test_score = scoring(estimator, *score_args)

if not isinstance(test_score, numbers.Number):
raise ValueError("scoring must return a number, got %s (%s)"
" instead." % (str(test_score), type(test_score)))

if verbose > 2:
msg += ", score=%f" % test_score
if verbose > 1:
print("score: %f" % score)
return score
end_msg = "%s -%s" % (msg,
logger.short_format_time(time.time() -
start_time))
print("Fitting fold %s %s" % ((64 - len(end_msg)) * '.', end_msg))
return {
'candidate_index': candidate_index,
'fold_index': fold_index,
'parameters': parameters,
'test_score': test_score,
'train_n_samples': _num_samples(X_train),
'test_n_samples': _num_samples(X_test),
}
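For reference, a minimal sketch (not part of the patch) of what one call to _fit_fold might produce on this branch, assuming the iris data, an SVC estimator and a stratified 2-fold split; the score shown is illustrative only:

from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold, _fit_fold

iris = load_iris()
X, y = iris.data, iris.target
train, test = next(iter(StratifiedKFold(y, n_folds=2)))  # first fold only

result = _fit_fold(SVC(), X, y, fold_index=0, train=train, test=test,
                   scoring=None, verbose=0,
                   parameters={'C': 1.0}, candidate_index=0)
# ``result`` is a plain dict, roughly:
# {'candidate_index': 0, 'fold_index': 0, 'parameters': {'C': 1.0},
#  'test_score': 0.96,            # illustrative value
#  'train_n_samples': 75, 'test_n_samples': 75}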


class CVScorer(object):
"""Parallelized cross-validation for a given estimator and dataset

Parameters
----------
cv : integer or cross-validation generator, optional
A cross-validation generator or number of folds (default 3). Folds will
be stratified if the estimator is a classifier.

scoring : string or callable, optional
Either a string ("zero_one", "f1", "roc_auc", ... for
classification, "mse", "r2", ... for regression) or a callable.
See 'Scoring objects' in the model evaluation section of the user guide
for details.

iid : boolean, optional
If True (default), the data is assumed to be identically distributed
across the folds, and the mean score is the total score per sample,
and not the mean score across the folds.

n_jobs : integer, optional
The number of jobs to run in parallel (default 1). -1 means 'all CPUs'.

pre_dispatch : int, or string, optional
Controls the number of jobs that get dispatched during parallel
execution. Reducing this number can be useful to avoid an
explosion of memory consumption when more jobs get dispatched
than CPUs can process. This parameter can be:

- None, in which case all the jobs are immediately
created and spawned. Use this for lightweight and
fast-running jobs, to avoid delays due to on-demand
spawning of the jobs

- An int, giving the exact number of total jobs that are
spawned

- A string, giving an expression as a function of n_jobs,
as in '2*n_jobs'

verbose : integer, optional
The verbosity level.

fit_params : dict, optional
Parameters to pass to the fit method of the estimator.
"""

def __init__(self, cv=None, scoring=None, iid=True,
n_jobs=1, pre_dispatch='2*n_jobs', verbose=0, fit_params=None,
score_func=None):

self.cv = cv
self.iid = iid
self.verbose = verbose
self.fit_params = fit_params

if score_func is not None:
warnings.warn("Passing function as ``score_func`` is "
"deprecated and will be removed in 0.15. "
"Either use strings or score objects.", stacklevel=2)
self.scoring = Scorer(score_func)
elif isinstance(scoring, string_types):
self.scoring = SCORERS[scoring]
else:
self.scoring = scoring

self.parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
pre_dispatch=pre_dispatch)

def __call__(self, estimator, X, y=None):
"""Cross-validate the estimator on the given data.

Parameters
----------
estimator : estimator object implementing 'fit'
The object to use to fit the data.

X : array-like of shape at least 2D
The data to fit.

y : array-like, optional
The target variable to try to predict in the case of
supervised learning.

Returns
-------
fold_results : list of dicts
For each cross-validation fold, this provides fields:
* ``test_score``, the score for this fold
* ``fold_index``, the index of the fold
* ``train_n_samples``, the number of samples in training
* ``test_n_samples``, the number of samples in testing
"""
return sorted(self.search([{}], estimator, X, y),
key=operator.itemgetter('fold_index'))

def search(self, candidates, estimator, X, y=None):
"""Cross-validate the estimator for candidate parameter settings.

Parameters
----------
candidates : iterable of dicts
The estimator will be cloned and have these parameters
set for each candidate.

estimator : estimator object implementing 'fit'
The object to use to fit the data.

X : array-like of shape at least 2D
The data to fit.

y : array-like, optional
The target variable to try to predict in the case of
supervised learning.

Returns
-------
fold_results : iterable of dicts
For each cross-validation fold, this provides fields:
* ``test_score``, the score for this fold
* ``parameters``, a dictionary of parameter settings
* ``candidate_index``, the index of the candidate
* ``fold_index``, the index of the fold
* ``train_n_samples``, the number of samples in training
* ``test_n_samples``, the number of samples in testing
"""
X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
cv = check_cv(self.cv, X, y, classifier=is_classifier(estimator))
cv = list(cv)
if self.scoring is None and not hasattr(estimator, 'score'):
raise TypeError(
"If no scoring is specified, the estimator passed "
"should have a 'score' method. The estimator %s "
"does not." % estimator)

out = self.parallel(
delayed(_fit_fold)(
estimator, X, y,
fold_index, train=train, test=test,
parameters=parameters, candidate_index=candidate_index,
scoring=self.scoring, verbose=self.verbose,
fit_params=self.fit_params
)
for candidate_index, parameters in enumerate(candidates)
for fold_index, (train, test) in enumerate(cv))

# return iterator for future compatibility
return iter(out)
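As a usage sketch (again, not part of the patch and assuming this branch), CVScorer can be called directly for plain cross-validation or via ``search`` for candidate parameter settings; the per-fold dicts also make the iid-weighted aggregation described in the class docstring a one-liner:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.cross_validation import CVScorer

iris = load_iris()
cv_scorer = CVScorer(cv=5)   # scoring=None falls back to estimator.score

# Plain cross-validation: one dict per fold, sorted by fold_index.
folds = cv_scorer(SVC(), iris.data, iris.target)
scores = np.array([f['test_score'] for f in folds])

# Aggregation in the spirit of the ``iid`` docstring above: weight each
# fold's score by its number of test samples (done here by hand; the WIP
# class stores ``iid`` but does not yet apply it).
weights = np.array([f['test_n_samples'] for f in folds], dtype=float)
iid_mean = np.average(scores, weights=weights)

# Candidate search: the same fold dicts, plus 'parameters' and
# 'candidate_index', so a grid search can regroup them per candidate.
candidates = [{'C': 0.1}, {'C': 1.0}, {'C': 10.0}]
results = list(cv_scorer.search(candidates, SVC(), iris.data, iris.target))
mean_per_candidate = [
    np.mean([r['test_score'] for r in results if r['candidate_index'] == i])
    for i in range(len(candidates))]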


def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
@@ -1090,20 +1325,17 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
See 'Scoring objects' in the model evaluation section of the user guide
for details.

cv : cross-validation generator, optional
A cross-validation generator. If None, a 3-fold cross
validation is used or 3-fold stratified cross-validation
when y is supplied and estimator is a classifier.

n_jobs : integer, optional
The number of CPUs to use to do the computation. -1 means
'all CPUs'.
cv : integer or cross-validation generator, optional
A cross-validation generator or number of folds (default 3). Folds will
be stratified if the estimator is a classifier.

verbose : integer, optional
The verbosity level.
iid : boolean, optional
If True (default), the data is assumed to be identically distributed
across the folds, and the mean score is the total score per sample,
and not the mean score across the folds.

fit_params : dict, optional
Parameters to pass to the fit method of the estimator.
n_jobs : integer, optional
The number of jobs to run in parallel (default 1). -1 means 'all CPUs'.

pre_dispatch : int, or string, optional
Controls the number of jobs that get dispatched during parallel
@@ -1122,37 +1354,22 @@
- A string, giving an expression as a function of n_jobs,
as in '2*n_jobs'

verbose : integer, optional
The verbosity level.

fit_params : dict, optional
Parameters to pass to the fit method of the estimator.

Returns
-------
scores : array of float, shape=(len(list(cv)),)
scores : array of float, shape=[len(cv)]
Array of scores of the estimator for each run of the cross validation.
"""
X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
if score_func is not None:
warnings.warn("Passing function as ``score_func`` is "
"deprecated and will be removed in 0.15. "
"Either use strings or score objects.", stacklevel=2)
scorer = Scorer(score_func)
elif isinstance(scoring, string_types):
scorer = SCORERS[scoring]
else:
scorer = scoring
if scorer is None and not hasattr(estimator, 'score'):
raise TypeError(
"If no scoring is specified, the estimator passed "
"should have a 'score' method. The estimator %s "
"does not." % estimator)
# We clone the estimator to make sure that all the folds are
# independent, and that it is pickle-able.
fit_params = fit_params if fit_params is not None else {}
parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
pre_dispatch=pre_dispatch)
scores = parallel(
delayed(_cross_val_score)(clone(estimator), X, y, scorer, train, test,
verbose, fit_params)
for train, test in cv)
return np.array(scores)
cv_score = CVScorer(
scoring=scoring, cv=cv, n_jobs=n_jobs,
verbose=verbose, fit_params=fit_params, score_func=score_func,
pre_dispatch=pre_dispatch)
return [fold['test_score'] for fold in cv_score(estimator, X, y)]
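A quick sketch (not from the patch) of the unchanged calling convention, assuming the iris data; note that this WIP version returns a plain list of per-fold scores, while the docstring above still advertises an array:

from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score

iris = load_iris()
scores = cross_val_score(SVC(), iris.data, iris.target, cv=5)
# ``scores`` is now a list of 5 floats (one per fold); callers relying on
# ndarray semantics would need np.asarray(scores).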


def _permutation_test_score(estimator, X, y, cv, scorer):