From b217697e4c3b6e2cf5f01cb87fe7a094b2168669 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Thu, 9 Jan 2014 09:18:02 +0100 Subject: [PATCH 01/20] Refactor cv code --- sklearn/cross_validation.py | 153 ++++++++++++++++++++++++------------ sklearn/grid_search.py | 75 +----------------- sklearn/learning_curve.py | 2 +- 3 files changed, 103 insertions(+), 127 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 84045105d56a9..06d909ce35dfa 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -22,6 +22,7 @@ from .base import is_classifier, clone from .utils import check_arrays, check_random_state, safe_mask +from .utils.validation import _num_samples from .utils.fixes import unique from .externals.joblib import Parallel, delayed from .externals.six import string_types, with_metaclass @@ -1023,48 +1024,6 @@ def __len__(self): ############################################################################## -def _cross_val_score(estimator, X, y, scorer, train, test, verbose, - fit_params): - """Inner loop for cross validation""" - n_samples = X.shape[0] if sp.issparse(X) else len(X) - fit_params = dict([(k, np.asarray(v)[train] - if hasattr(v, '__len__') and len(v) == n_samples else v) - for k, v in fit_params.items()]) - if not hasattr(X, "shape"): - if getattr(estimator, "_pairwise", False): - raise ValueError("Precomputed kernels or affinity matrices have " - "to be passed as arrays or sparse matrices.") - X_train = [X[idx] for idx in train] - X_test = [X[idx] for idx in test] - else: - if getattr(estimator, "_pairwise", False): - # X is a precomputed square kernel matrix - if X.shape[0] != X.shape[1]: - raise ValueError("X should be a square kernel matrix") - X_train = X[np.ix_(train, train)] - X_test = X[np.ix_(test, train)] - else: - X_train = X[safe_mask(X, train)] - X_test = X[safe_mask(X, test)] - - if y is None: - y_train = None - y_test = None - else: - y_train = y[train] - y_test = y[test] - estimator.fit(X_train, y_train, **fit_params) - if scorer is None: - score = estimator.score(X_test, y_test) - else: - score = scorer(estimator, X_test, y_test) - if not isinstance(score, numbers.Number): - raise ValueError("scoring must return a number, got %s (%s)" - " instead." % (str(score), type(score))) - if verbose > 1: - print("score: %f" % score) - return score - def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, score_func=None, @@ -1127,16 +1086,9 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, """ X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = _deprecate_loss_and_score_funcs( - loss_func=None, - score_func=score_func, - scoring=scoring - ) - if scorer is None and not hasattr(estimator, 'score'): - raise TypeError( - "If no scoring is specified, the estimator passed " - "should have a 'score' method. The estimator %s " - "does not." % estimator) + _check_scorable(estimator, score_func=score_func, scoring=scoring) + scorer = _deprecate_loss_and_score_funcs(score_func=score_func, + scoring=scoring) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. 
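For orientation, the scorer resolution being wired in above supports two paths: an explicit scoring string (or callable) resolved through the scorer machinery, or a fallback to the estimator's own score method when nothing is specified. A rough caller-level sketch, with module paths as on this branch and an illustrative estimator and dataset that are not taken from the patch:

    from sklearn.cross_validation import cross_val_score
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    iris = load_iris()
    clf = LogisticRegression()

    # scoring=None: each fold falls back to clf.score (mean accuracy here)
    scores_default = cross_val_score(clf, iris.data, iris.target, cv=5)

    # scoring="accuracy": resolved to a scorer object instead of clf.score
    scores_named = cross_val_score(clf, iris.data, iris.target, cv=5,
                                   scoring="accuracy")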
fit_params = fit_params if fit_params is not None else {} @@ -1149,6 +1101,85 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, return np.array(scores) +def _cross_val_score(estimator, X, y, scorer, train, test, verbose, + fit_params): + """Inner loop for cross validation""" + # TODO replace with grid_search.fit_grid_point() + n_samples = _num_samples(X) + fit_params = dict([(k, np.asarray(v)[train] # TODO why is this necessary? + if hasattr(v, '__len__') and len(v) == n_samples else v) + for k, v in fit_params.items()]) + + X_train, y_train = _split(estimator, X, y, train) + X_test, y_test = _split(estimator, X, y, test, train) + estimator.fit(X_train, y_train, **fit_params) + score = _score(estimator, X_test, y_test, scorer) + + if verbose > 1: + print("score: %f" % score) + return score + + +def _split(estimator, X, y, indices, train_indices=None): + """Create subset of dataset.""" + if hasattr(estimator, 'kernel') and callable(estimator.kernel): + # cannot compute the kernel values with custom function + raise ValueError("Cannot use a custom kernel function. " + "Precompute the kernel matrix instead.") + + if not hasattr(X, "shape"): + if getattr(estimator, "_pairwise", False): + raise ValueError("Precomputed kernels or affinity matrices have " + "to be passed as arrays or sparse matrices.") + X_subset = [X[idx] for idx in indices] + else: + if getattr(estimator, "_pairwise", False): + # X is a precomputed square kernel matrix + if X.shape[0] != X.shape[1]: + raise ValueError("X should be a square kernel matrix") + if train_indices is None: + X_subset = X[np.ix_(indices, indices)] + else: + X_subset = X[np.ix_(indices, train_indices)] + else: + X_subset = X[safe_mask(X, indices)] + + if y is not None: + y_subset = y[safe_mask(y, indices)] + else: + y_subset = None + + return X_subset, y_subset + + +def _fit(fit_function, X_train, y_train, **fit_params): + """Fit and estimator on a given training set.""" + if y_train is None: + fit_function(X_train, **fit_params) + else: + fit_function(X_train, y_train, **fit_params) + + +def _score(estimator, X_test, y_test, scorer): + """Compute the score of an estimator on a given test set.""" + if y_test is None: + if scorer is None: + score = estimator.score(X_test) + else: + score = scorer(estimator, X_test) + else: + if scorer is None: + score = estimator.score(X_test, y_test) + else: + score = scorer(estimator, X_test, y_test) + + if not isinstance(score, numbers.Number): + raise ValueError("scoring must return a number, got %s (%s) instead." + % (str(score), type(score))) + + return score + + def _permutation_test_score(estimator, X, y, cv, scorer): """Auxiliary function for permutation_test_score""" avg_score = [] @@ -1226,6 +1257,24 @@ def _check_cv(cv, X=None, y=None, classifier=False, warn_mask=False): return cv +def _check_scorable(estimator, scoring=None, loss_func=None, score_func=None): + """Check that estimator can be fitted and score can be computed.""" + if (not hasattr(estimator, 'fit') or + not (hasattr(estimator, 'predict') + or hasattr(estimator, 'score'))): + raise TypeError("estimator should a be an estimator implementing" + " 'fit' and 'predict' or 'score' methods," + " %s (type %s) was passed" % + (estimator, type(estimator))) + if (scoring is None and loss_func is None and score_func + is None): + if not hasattr(estimator, 'score'): + raise TypeError( + "If no scoring is specified, the estimator passed " + "should have a 'score' method. The estimator %s " + "does not." 
% estimator) + + def permutation_test_score(estimator, X, y, score_func=None, cv=None, n_permutations=100, n_jobs=1, labels=None, random_state=0, verbose=0, scoring=None): diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 108d320139c2e..b3fa04dc4bc8a 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -24,6 +24,7 @@ from .base import BaseEstimator, is_classifier, clone from .base import MetaEstimatorMixin from .cross_validation import _check_cv as check_cv +from .cross_validation import _check_scorable, _split, _fit, _score from .externals.joblib import Parallel, delayed, logger from .externals import six from .utils import safe_mask, check_random_state @@ -255,62 +256,6 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, return this_score, parameters, _num_samples(X_test) -def _split(estimator, X, y, indices, train_indices=None): - """Create subset of dataset.""" - if hasattr(estimator, 'kernel') and callable(estimator.kernel): - # cannot compute the kernel values with custom function - raise ValueError("Cannot use a custom kernel function. " - "Precompute the kernel matrix instead.") - - if not hasattr(X, "shape"): - if getattr(estimator, "_pairwise", False): - raise ValueError("Precomputed kernels or affinity matrices have " - "to be passed as arrays or sparse matrices.") - X_subset = [X[idx] for idx in indices] - else: - if getattr(estimator, "_pairwise", False): - # X is a precomputed square kernel matrix - if X.shape[0] != X.shape[1]: - raise ValueError("X should be a square kernel matrix") - if train_indices is None: - X_subset = X[np.ix_(indices, indices)] - else: - X_subset = X[np.ix_(indices, train_indices)] - else: - X_subset = X[safe_mask(X, indices)] - - if y is not None: - y_subset = y[safe_mask(y, indices)] - else: - y_subset = None - - return X_subset, y_subset - - -def _fit(fit_function, X_train, y_train, **fit_params): - """Fit and estimator on a given training set.""" - if y_train is None: - fit_function(X_train, **fit_params) - else: - fit_function(X_train, y_train, **fit_params) - - -def _score(estimator, X_test, y_test, scorer): - """Compute the score of an estimator on a given test set.""" - if y_test is None: - if scorer is None: - this_score = estimator.score(X_test) - else: - this_score = scorer(estimator, X_test) - else: - if scorer is None: - this_score = estimator.score(X_test, y_test) - else: - this_score = scorer(estimator, X_test, y_test) - - return this_score - - def _check_param_grid(param_grid): if hasattr(param_grid, 'items'): param_grid = [param_grid] @@ -351,24 +296,6 @@ def __repr__(self): self.parameters) -def _check_scorable(estimator, scoring=None, loss_func=None, score_func=None): - """Check that estimator can be fitted and score can be computed.""" - if (not hasattr(estimator, 'fit') or - not (hasattr(estimator, 'predict') - or hasattr(estimator, 'score'))): - raise TypeError("estimator should a be an estimator implementing" - " 'fit' and 'predict' or 'score' methods," - " %s (type %s) was passed" % - (estimator, type(estimator))) - if (scoring is None and loss_func is None and score_func - is None): - if not hasattr(estimator, 'score'): - raise TypeError( - "If no scoring is specified, the estimator passed " - "should have a 'score' method. The estimator %s " - "does not." 
% estimator) - - class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, MetaEstimatorMixin)): """Base class for hyper parameter search with cross-validation.""" diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 406f28b12c280..f17c9a5a9fe30 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -11,7 +11,7 @@ from .utils import check_arrays from .externals.joblib import Parallel, delayed from .metrics.scorer import get_scorer -from .grid_search import _check_scorable, _split, _fit, _score +from .cross_validation import _check_scorable, _split, _fit, _score def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), From c4d6278c83859de73bb6c53a62dc98973d6a9c79 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Thu, 9 Jan 2014 23:37:01 +0100 Subject: [PATCH 02/20] Clean up --- sklearn/cross_validation.py | 5 ++--- sklearn/grid_search.py | 11 +++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 06d909ce35dfa..4d42a626c24eb 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1091,7 +1091,6 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, scoring=scoring) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. - fit_params = fit_params if fit_params is not None else {} parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) scores = parallel( @@ -1104,15 +1103,15 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, def _cross_val_score(estimator, X, y, scorer, train, test, verbose, fit_params): """Inner loop for cross validation""" - # TODO replace with grid_search.fit_grid_point() n_samples = _num_samples(X) + fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, np.asarray(v)[train] # TODO why is this necessary? 
if hasattr(v, '__len__') and len(v) == n_samples else v) for k, v in fit_params.items()]) X_train, y_train = _split(estimator, X, y, train) X_test, y_test = _split(estimator, X, y, test, train) - estimator.fit(X_train, y_train, **fit_params) + _fit(estimator.fit, X_train, y_train, **fit_params) score = _score(estimator, X_test, y_test, scorer) if verbose > 1: diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index b3fa04dc4bc8a..bdbc26c9436c6 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -243,17 +243,16 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, X_train, y_train = _split(estimator, X, y, train) X_test, y_test = _split(estimator, X, y, test, train) _fit(estimator.fit, X_train, y_train, **fit_params) - this_score = _score(estimator, X_test, y_test, scorer) + score = _score(estimator, X_test, y_test, scorer) if verbose > 2: - msg += ", score=%f" % this_score + msg += ", score=%f" % score if verbose > 1: - end_msg = "%s -%s" % (msg, - logger.short_format_time(time.time() - - start_time)) + end_msg = "%s -%s" % (msg, logger.short_format_time(time.time() - + start_time)) print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - return this_score, parameters, _num_samples(X_test) + return score, parameters, _num_samples(X_test) def _check_param_grid(param_grid): From 1599952d022fee81fc043a712dee4eae5a2dae5a Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Fri, 10 Jan 2014 09:27:43 +0100 Subject: [PATCH 03/20] Refactor RFE and add _check_scorable --- sklearn/feature_selection/rfe.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index cc80d7ffdcbda..49820742289f8 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -13,6 +13,7 @@ from ..base import clone from ..base import is_classifier from ..cross_validation import _check_cv as check_cv +from ..cross_validation import _check_scorable, _split, _score from .base import SelectorMixin from ..metrics.scorer import _deprecate_loss_and_score_funcs @@ -325,29 +326,31 @@ def fit(self, X, y): verbose=self.verbose - 1) cv = check_cv(self.cv, X, y, is_classifier(self.estimator)) + _check_scorable(self.estimator, scoring=self.scoring, + loss_func=self.loss_func) scores = np.zeros(X.shape[1]) # Cross-validation for n, (train, test) in enumerate(cv): - X_train, X_test = X[train], X[test] - y_train, y_test = y[train], y[test] + X_train, y_train = _split(self.estimator, X, y, train) + X_test, y_test = _split(self.estimator, X, y, test, train) # Compute a full ranking of the features ranking_ = rfe.fit(X_train, y_train).ranking_ # Score each subset of features for k in range(0, max(ranking_)): mask = np.where(ranking_ <= k + 1)[0] + X_train_subset = X_train[:, mask] + X_test_subset = X_test[:, mask] + estimator = clone(self.estimator) - estimator.fit(X_train[:, mask], y_train) - - if self.loss_func is None and self.scoring is None: - score = estimator.score(X_test[:, mask], y_test) - else: - scorer = _deprecate_loss_and_score_funcs( - loss_func=self.loss_func, - scoring=self.scoring - ) - score = scorer(estimator, X_test[:, mask], y_test) + estimator.fit(X_train_subset, y_train) + + scorer = _deprecate_loss_and_score_funcs( + loss_func=self.loss_func, + scoring=self.scoring + ) + score = _score(estimator, X_test_subset, y_test, scorer) if self.verbose > 0: print("Finished fold with %d / %d feature ranks, score=%f" From 
5e520318c508d7fa151495e637ecbdb23264dc6c Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Fri, 10 Jan 2014 09:41:08 +0100 Subject: [PATCH 04/20] FIX typo in docstring --- sklearn/cross_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 4d42a626c24eb..ca818076c3dcb 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1152,7 +1152,7 @@ def _split(estimator, X, y, indices, train_indices=None): def _fit(fit_function, X_train, y_train, **fit_params): - """Fit and estimator on a given training set.""" + """Fit an estimator on a given training set.""" if y_train is None: fit_function(X_train, **fit_params) else: From 4b5f468c05814efa9baf8d8d3c34a54f5ae61f1e Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Fri, 10 Jan 2014 10:53:54 +0100 Subject: [PATCH 05/20] Merge `fit_grid_point` into `_cross_val_score` --- sklearn/cross_validation.py | 35 +++++++++++++++++++++++-------- sklearn/grid_search.py | 41 ++++++++++--------------------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index ca818076c3dcb..050e70c2d8089 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -15,6 +15,7 @@ from itertools import chain, combinations from math import ceil, floor, factorial import numbers +import time from abc import ABCMeta, abstractmethod import numpy as np @@ -24,7 +25,7 @@ from .utils import check_arrays, check_random_state, safe_mask from .utils.validation import _num_samples from .utils.fixes import unique -from .externals.joblib import Parallel, delayed +from .externals.joblib import Parallel, delayed, logger from .externals.six import string_types, with_metaclass from .metrics.scorer import _deprecate_loss_and_score_funcs @@ -1095,17 +1096,30 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, pre_dispatch=pre_dispatch) scores = parallel( delayed(_cross_val_score)(clone(estimator), X, y, scorer, train, test, - verbose, fit_params) + parameters=None, verbose=verbose, + fit_params=fit_params, + log_label="cross_val_score") for train, test in cv) - return np.array(scores) + return np.array(scores)[:, 0] -def _cross_val_score(estimator, X, y, scorer, train, test, verbose, - fit_params): +def _cross_val_score(estimator, X, y, scorer, train, test, parameters, verbose, + fit_params, log_label): """Inner loop for cross validation""" + if parameters is not None: + estimator.set_params(**parameters) + if verbose > 1: + start_time = time.time() + if parameters is None: + msg = "Evaluating..." + else: + msg = '%s' % (', '.join('%s=%s' % (k, v) + for k, v in parameters.items())) + print("[%s] %s %s" % (log_label, msg, (64 - len(msg)) * '.')) + n_samples = _num_samples(X) fit_params = fit_params if fit_params is not None else {} - fit_params = dict([(k, np.asarray(v)[train] # TODO why is this necessary? 
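The per-sample fit-parameter slicing handled in this hunk does roughly the following; a standalone sketch, where the helper name and the values are illustrative only:

    import numpy as np

    def _slice_fit_params(fit_params, train, n_samples):
        # arguments aligned with the samples (e.g. sample_weight) are subset to
        # the training indices; everything else is passed through unchanged
        fit_params = fit_params if fit_params is not None else {}
        return dict((k, np.asarray(v)[train]
                     if hasattr(v, '__len__') and len(v) == n_samples else v)
                    for k, v in fit_params.items())

    train = np.array([0, 2, 3])
    sliced = _slice_fit_params({'sample_weight': [0.5, 1.0, 1.0, 2.0],
                                'check_input': True}, train, n_samples=4)
    # sliced['sample_weight'] -> array([0.5, 1.0, 2.0]); 'check_input' stays True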
+ fit_params = dict([(k, np.asarray(v)[train] if hasattr(v, '__len__') and len(v) == n_samples else v) for k, v in fit_params.items()]) @@ -1114,9 +1128,14 @@ def _cross_val_score(estimator, X, y, scorer, train, test, verbose, _fit(estimator.fit, X_train, y_train, **fit_params) score = _score(estimator, X_test, y_test, scorer) + if verbose > 2: + msg += ", score=%f" % score if verbose > 1: - print("score: %f" % score) - return score + end_msg = "%s -%s" % (msg, logger.short_format_time(time.time() - + start_time)) + print("[%s] %s %s" % (log_label, (64 - len(end_msg)) * '.', end_msg)) + + return score, _num_samples(X_test) def _split(estimator, X, y, indices, train_indices=None): diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index bdbc26c9436c6..87ff9dedbdb1b 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -16,7 +16,6 @@ from itertools import product import numbers import operator -import time import warnings import numpy as np @@ -24,8 +23,8 @@ from .base import BaseEstimator, is_classifier, clone from .base import MetaEstimatorMixin from .cross_validation import _check_cv as check_cv -from .cross_validation import _check_scorable, _split, _fit, _score -from .externals.joblib import Parallel, delayed, logger +from .cross_validation import _check_scorable, _cross_val_score +from .externals.joblib import Parallel, delayed from .externals import six from .utils import safe_mask, check_random_state from .utils.validation import _num_samples, check_arrays @@ -184,7 +183,7 @@ def __len__(self): return self.n_iter -def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, +def fit_grid_point(X, y, estimator, parameters, train, test, scorer, verbose, loss_func=None, **fit_params): """Run fit on one set of parameters. @@ -196,11 +195,11 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, y : array-like or None Targets for input data. - base_estimator : estimator object + estimator : estimator object This estimator will be cloned and then fitted. parameters : dict - Parameters to be set on base_estimator clone for this grid point. + Parameters to be set on estimator for this grid point. train : ndarray, dtype int or bool Boolean mask or indices for training set. @@ -230,29 +229,11 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, n_samples_test : int Number of test samples in this split. 
""" - if verbose > 1: - start_time = time.time() - msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) - print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')) - - # update parameters of the classifier after a copy of its base structure - estimator = clone(base_estimator) - estimator.set_params(**parameters) - - X_train, y_train = _split(estimator, X, y, train) - X_test, y_test = _split(estimator, X, y, test, train) - _fit(estimator.fit, X_train, y_train, **fit_params) - score = _score(estimator, X_test, y_test, scorer) - - if verbose > 2: - msg += ", score=%f" % score - if verbose > 1: - end_msg = "%s -%s" % (msg, logger.short_format_time(time.time() - - start_time)) - print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - - return score, parameters, _num_samples(X_test) + score, n_samples_test = _cross_val_score(estimator, X, y, scorer, train, + test, parameters, verbose, + fit_params, + log_label="GridSearchCV") + return score, parameters, n_samples_test def _check_param_grid(param_grid): @@ -397,7 +378,7 @@ def _fit(self, X, y, parameter_iterable): n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch)( delayed(fit_grid_point)( - X, y, base_estimator, parameters, train, test, + X, y, clone(base_estimator), parameters, train, test, self.scorer_, self.verbose, **self.fit_params) for parameters in parameter_iterable for train, test in cv) From 38081fdd56b6372a3ef6e768f134d46fd6a187ec Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sat, 11 Jan 2014 00:21:23 +0100 Subject: [PATCH 06/20] Return time --- sklearn/cross_validation.py | 29 +++++++++++++++-------------- sklearn/grid_search.py | 10 +++++----- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 050e70c2d8089..25efa841fcf35 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1103,39 +1103,40 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, return np.array(scores)[:, 0] -def _cross_val_score(estimator, X, y, scorer, train, test, parameters, verbose, - fit_params, log_label): +def _cross_val_score(estimator, X, y, scorer, train, test, parameters, + verbose, fit_params, log_label): """Inner loop for cross validation""" if parameters is not None: estimator.set_params(**parameters) - if verbose > 1: - start_time = time.time() - if parameters is None: - msg = "Evaluating..." 
- else: - msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) - print("[%s] %s %s" % (log_label, msg, (64 - len(msg)) * '.')) - n_samples = _num_samples(X) fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, np.asarray(v)[train] if hasattr(v, '__len__') and len(v) == n_samples else v) for k, v in fit_params.items()]) + start_time = time.time() + + if verbose > 1: + if parameters is None: + msg = "" + else: + msg = '%s' % (', '.join('%s=%s' % (k, v) + for k, v in parameters.items())) + print("[%s] %s %s" % (log_label, msg, (64 - len(msg)) * '.')) + X_train, y_train = _split(estimator, X, y, train) X_test, y_test = _split(estimator, X, y, test, train) _fit(estimator.fit, X_train, y_train, **fit_params) score = _score(estimator, X_test, y_test, scorer) + scoring_time = time.time() - start_time if verbose > 2: msg += ", score=%f" % score if verbose > 1: - end_msg = "%s -%s" % (msg, logger.short_format_time(time.time() - - start_time)) + end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) print("[%s] %s %s" % (log_label, (64 - len(end_msg)) * '.', end_msg)) - return score, _num_samples(X_test) + return score, _num_samples(X_test), scoring_time def _split(estimator, X, y, indices, train_indices=None): diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 87ff9dedbdb1b..c7824f2e8b63f 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -184,7 +184,7 @@ def __len__(self): def fit_grid_point(X, y, estimator, parameters, train, test, scorer, - verbose, loss_func=None, **fit_params): + verbose, **fit_params): """Run fit on one set of parameters. Parameters @@ -229,10 +229,10 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer, n_samples_test : int Number of test samples in this split. 
""" - score, n_samples_test = _cross_val_score(estimator, X, y, scorer, train, - test, parameters, verbose, - fit_params, - log_label="GridSearchCV") + score, n_samples_test, _ = _cross_val_score(estimator, X, y, scorer, + train, test, parameters, + verbose, fit_params, + log_label="GridSearchCV") return score, parameters, n_samples_test From 30c86ea2ddccab0610944a83bffea1719cc810fd Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sat, 11 Jan 2014 00:32:19 +0100 Subject: [PATCH 07/20] Move set_params back to fit_grid_point --- sklearn/cross_validation.py | 25 ++++--------------------- sklearn/grid_search.py | 21 ++++++++++++++++----- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 25efa841fcf35..377dbcced6572 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -25,7 +25,7 @@ from .utils import check_arrays, check_random_state, safe_mask from .utils.validation import _num_samples from .utils.fixes import unique -from .externals.joblib import Parallel, delayed, logger +from .externals.joblib import Parallel, delayed from .externals.six import string_types, with_metaclass from .metrics.scorer import _deprecate_loss_and_score_funcs @@ -1096,18 +1096,14 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, pre_dispatch=pre_dispatch) scores = parallel( delayed(_cross_val_score)(clone(estimator), X, y, scorer, train, test, - parameters=None, verbose=verbose, - fit_params=fit_params, - log_label="cross_val_score") + verbose=verbose, fit_params=fit_params) for train, test in cv) return np.array(scores)[:, 0] -def _cross_val_score(estimator, X, y, scorer, train, test, parameters, - verbose, fit_params, log_label): +def _cross_val_score(estimator, X, y, scorer, train, test, + verbose, fit_params): """Inner loop for cross validation""" - if parameters is not None: - estimator.set_params(**parameters) n_samples = _num_samples(X) fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, np.asarray(v)[train] @@ -1116,25 +1112,12 @@ def _cross_val_score(estimator, X, y, scorer, train, test, parameters, start_time = time.time() - if verbose > 1: - if parameters is None: - msg = "" - else: - msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) - print("[%s] %s %s" % (log_label, msg, (64 - len(msg)) * '.')) - X_train, y_train = _split(estimator, X, y, train) X_test, y_test = _split(estimator, X, y, test, train) _fit(estimator.fit, X_train, y_train, **fit_params) score = _score(estimator, X_test, y_test, scorer) scoring_time = time.time() - start_time - if verbose > 2: - msg += ", score=%f" % score - if verbose > 1: - end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) - print("[%s] %s %s" % (log_label, (64 - len(end_msg)) * '.', end_msg)) return score, _num_samples(X_test), scoring_time diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index c7824f2e8b63f..4db7d435256a8 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -24,7 +24,7 @@ from .base import MetaEstimatorMixin from .cross_validation import _check_cv as check_cv from .cross_validation import _check_scorable, _cross_val_score -from .externals.joblib import Parallel, delayed +from .externals.joblib import Parallel, delayed, logger from .externals import six from .utils import safe_mask, check_random_state from .utils.validation import _num_samples, check_arrays @@ -229,10 +229,21 @@ def fit_grid_point(X, y, estimator, parameters, 
train, test, scorer, n_samples_test : int Number of test samples in this split. """ - score, n_samples_test, _ = _cross_val_score(estimator, X, y, scorer, - train, test, parameters, - verbose, fit_params, - log_label="GridSearchCV") + if verbose > 1: + msg = '%s' % (', '.join('%s=%s' % (k, v) + for k, v in parameters.items())) + print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')) + + estimator.set_params(**parameters) + score, n_samples_test, scoring_time = _cross_val_score( + estimator, X, y, scorer, train, test, verbose, fit_params) + + if verbose > 2: + msg += ", score=%f" % score + if verbose > 1: + end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) + print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) + return score, parameters, n_samples_test From 389ed8dbfd018c5ebf6bbf510cb739b1133aca71 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sat, 11 Jan 2014 16:34:17 +0100 Subject: [PATCH 08/20] Log score and time in 'cross_val_score' --- sklearn/cross_validation.py | 3 +++ sklearn/grid_search.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 377dbcced6572..080206f03441c 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1119,6 +1119,9 @@ def _cross_val_score(estimator, X, y, scorer, train, test, scoring_time = time.time() - start_time + if verbose > 1: + print("score %f in %f s" % (score, scoring_time)) + return score, _num_samples(X_test), scoring_time diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 4db7d435256a8..eac8823656439 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -236,7 +236,8 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer, estimator.set_params(**parameters) score, n_samples_test, scoring_time = _cross_val_score( - estimator, X, y, scorer, train, test, verbose, fit_params) + estimator, X, y, scorer, train, test, verbose=0, + fit_params=fit_params) if verbose > 2: msg += ", score=%f" % score From 1fa3ec363e3fb261b87b5c0ccc681e4fa5df70b3 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sun, 12 Jan 2014 00:40:19 +0100 Subject: [PATCH 09/20] check_scorable returns scorer --- sklearn/cross_validation.py | 42 +++------------------ sklearn/feature_selection/rfe.py | 13 ++----- sklearn/grid_search.py | 12 +++--- sklearn/learning_curve.py | 7 ++-- sklearn/metrics/scorer.py | 63 +++++++++++++++++++++++++++++++ sklearn/tests/test_grid_search.py | 11 +++--- 6 files changed, 86 insertions(+), 62 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 080206f03441c..c03cc4076c6b7 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -27,7 +27,7 @@ from .utils.fixes import unique from .externals.joblib import Parallel, delayed from .externals.six import string_types, with_metaclass -from .metrics.scorer import _deprecate_loss_and_score_funcs +from .metrics.scorer import check_scorable __all__ = ['Bootstrap', 'KFold', @@ -1087,9 +1087,7 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, """ X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - _check_scorable(estimator, score_func=score_func, scoring=scoring) - scorer = _deprecate_loss_and_score_funcs(score_func=score_func, - scoring=scoring) + scorer = check_scorable(estimator, score_func=score_func, scoring=scoring) # We clone the estimator to make sure 
that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, @@ -1168,20 +1166,12 @@ def _fit(fit_function, X_train, y_train, **fit_params): def _score(estimator, X_test, y_test, scorer): """Compute the score of an estimator on a given test set.""" if y_test is None: - if scorer is None: - score = estimator.score(X_test) - else: - score = scorer(estimator, X_test) + score = scorer(estimator, X_test) else: - if scorer is None: - score = estimator.score(X_test, y_test) - else: - score = scorer(estimator, X_test, y_test) - + score = scorer(estimator, X_test, y_test) if not isinstance(score, numbers.Number): raise ValueError("scoring must return a number, got %s (%s) instead." % (str(score), type(score))) - return score @@ -1262,24 +1252,6 @@ def _check_cv(cv, X=None, y=None, classifier=False, warn_mask=False): return cv -def _check_scorable(estimator, scoring=None, loss_func=None, score_func=None): - """Check that estimator can be fitted and score can be computed.""" - if (not hasattr(estimator, 'fit') or - not (hasattr(estimator, 'predict') - or hasattr(estimator, 'score'))): - raise TypeError("estimator should a be an estimator implementing" - " 'fit' and 'predict' or 'score' methods," - " %s (type %s) was passed" % - (estimator, type(estimator))) - if (scoring is None and loss_func is None and score_func - is None): - if not hasattr(estimator, 'score'): - raise TypeError( - "If no scoring is specified, the estimator passed " - "should have a 'score' method. The estimator %s " - "does not." % estimator) - - def permutation_test_score(estimator, X, y, score_func=None, cv=None, n_permutations=100, n_jobs=1, labels=None, random_state=0, verbose=0, scoring=None): @@ -1351,11 +1323,7 @@ def permutation_test_score(estimator, X, y, score_func=None, cv=None, """ X, y = check_arrays(X, y, sparse_format='csr') cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = _deprecate_loss_and_score_funcs( - loss_func=None, - score_func=score_func, - scoring=scoring - ) + scorer = check_scorable(estimator, scoring=scoring, score_func=score_func) random_state = check_random_state(random_state) # We clone the estimator to make sure that all the folds are diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 49820742289f8..54941036e044a 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -13,9 +13,9 @@ from ..base import clone from ..base import is_classifier from ..cross_validation import _check_cv as check_cv -from ..cross_validation import _check_scorable, _split, _score +from ..cross_validation import _split, _score from .base import SelectorMixin -from ..metrics.scorer import _deprecate_loss_and_score_funcs +from ..metrics.scorer import check_scorable class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin): @@ -326,8 +326,8 @@ def fit(self, X, y): verbose=self.verbose - 1) cv = check_cv(self.cv, X, y, is_classifier(self.estimator)) - _check_scorable(self.estimator, scoring=self.scoring, - loss_func=self.loss_func) + scorer = check_scorable(self.estimator, scoring=self.scoring, + loss_func=self.loss_func) scores = np.zeros(X.shape[1]) # Cross-validation @@ -345,11 +345,6 @@ def fit(self, X, y): estimator = clone(self.estimator) estimator.fit(X_train_subset, y_train) - - scorer = _deprecate_loss_and_score_funcs( - loss_func=self.loss_func, - scoring=self.scoring - ) score = _score(estimator, X_test_subset, y_test, scorer) if self.verbose > 0: diff --git 
a/sklearn/grid_search.py b/sklearn/grid_search.py index eac8823656439..d26f2a74fdd89 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -23,12 +23,12 @@ from .base import BaseEstimator, is_classifier, clone from .base import MetaEstimatorMixin from .cross_validation import _check_cv as check_cv -from .cross_validation import _check_scorable, _cross_val_score +from .cross_validation import _cross_val_score from .externals.joblib import Parallel, delayed, logger from .externals import six from .utils import safe_mask, check_random_state from .utils.validation import _num_samples, check_arrays -from .metrics.scorer import _deprecate_loss_and_score_funcs +from .metrics.scorer import check_scorable __all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point', @@ -308,8 +308,6 @@ def __init__(self, estimator, scoring=None, loss_func=None, self.cv = cv self.verbose = verbose self.pre_dispatch = pre_dispatch - _check_scorable(self.estimator, scoring=self.scoring, - loss_func=self.loss_func, score_func=self.score_func) def score(self, X, y=None): """Returns the score on the given test data and labels, if the search @@ -360,13 +358,13 @@ def _fit(self, X, y, parameter_iterable): estimator = self.estimator cv = self.cv + self.scorer_ = check_scorable(self.estimator, scoring=self.scoring, + loss_func=self.loss_func, + score_func=self.score_func) n_samples = _num_samples(X) X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr') - self.scorer_ = _deprecate_loss_and_score_funcs( - self.loss_func, self.score_func, self.scoring) - if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index f17c9a5a9fe30..a7fa2c28ce4e0 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -11,7 +11,8 @@ from .utils import check_arrays from .externals.joblib import Parallel, delayed from .metrics.scorer import get_scorer -from .cross_validation import _check_scorable, _split, _fit, _score +from .cross_validation import _split, _fit, _score +from .metrics.scorer import check_scorable def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), @@ -101,6 +102,7 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) # Make a list since we will be iterating multiple times over the folds cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) + scorer = check_scorable(estimator, scoring=scoring) # HACK as long as boolean indices are allowed in cv generators if cv[0][0].dtype == bool: @@ -119,9 +121,6 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), if verbose > 0: print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) - _check_scorable(estimator, scoring=scoring) - scorer = get_scorer(scoring) - parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) if exploit_incremental_learning: diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 2a28495890ba2..73c32d60836cf 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -198,6 +198,69 @@ def get_scorer(scoring): return scorer +class _passthrough_scorer(object): + """Callable that wraps estimator.score""" + def __call__(self, estimator, *args, **kwargs): + return estimator.score(*args, **kwargs) + + +def check_scorable(estimator, scoring=None, loss_func=None, score_func=None): + """Check if estimator can 
be scored. + + A TypeError will be thrown if the estimator cannot be scored. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + scoring : string, callable or None, optional, default: None + A string (see model evaluation documentation) or + a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + loss_func : callable or None, optional, default: None + A loss function callable object / function with signature + ``loss_func(estimator, X, y)``. + + score_func : callable or None, optional, default: None + A scoring function with signature + ``score_func(estimator, X, y)``. + + Returns + ------- + scoring : callable + A scorer callable object / function with signature + ``scorer(estimator, X, y)``. + """ + if not hasattr(estimator, 'fit'): + raise TypeError("estimator should a be an estimator implementing " + "'fit' method, %s (type %s) was passed" % + (estimator, type(estimator))) + + if scoring is None and loss_func is None and score_func is None: + if hasattr(estimator, 'score'): + return _passthrough_scorer() + else: + raise TypeError( + "If no scoring is specified, the estimator passed should " + "have a 'score' method. The estimator %s (type %s) " + "does not." % (estimator, type(estimator))) + else: + if hasattr(estimator, 'predict'): + scorer = _deprecate_loss_and_score_funcs(scoring=scoring, + loss_func=loss_func, score_func=score_func) + if scorer is None: + return ValueError("no scoring") + else: + return scorer + else: + raise TypeError( + "If a scoring is specified, the estimator passed should " + "have a 'predict' method. The estimator %s (type %s) " + "does not." % (estimator, type(estimator))) + + def make_scorer(score_func, greater_is_better=True, needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. 
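As a rough usage sketch of the check_scorable helper added above (the function exists only with this patch applied; the estimator and data below are illustrative):

    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics.scorer import check_scorable

    clf = LogisticRegression().fit([[0.], [1.], [2.], [3.]], [0, 0, 1, 1])

    # no scoring given: the returned callable simply defers to clf.score
    scorer = check_scorable(clf)
    acc = scorer(clf, [[0.], [3.]], [0, 1])

    # an explicit scoring string is resolved through the scorer registry and
    # requires a 'predict' method on the estimator
    scorer = check_scorable(clf, scoring="accuracy")
    acc = scorer(clf, [[0.], [3.]], [0, 1])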
diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index ff2510bce79fa..8d7107d10d9db 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -190,8 +190,9 @@ def test_grid_search_no_score(): assert_equal(grid_search.score(X, y), grid_search_no_score.score(X, y)) # giving no scoring function raises an error - assert_raise_message(TypeError, "no scoring", - GridSearchCV, clf_no_score, {'C': Cs}) + grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs}) + assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit, + [[1]]) def test_trivial_grid_scores(): @@ -494,9 +495,9 @@ def test_bad_estimator(): # test grid-search with clustering algorithm which doesn't support # "predict" sc = SpectralClustering() - assert_raises(TypeError, GridSearchCV, sc, - param_grid=dict(gamma=[.1, 1, 10]), - scoring='ari') + grid_search = GridSearchCV(sc, param_grid=dict(gamma=[.1, 1, 10]), + scoring='ari') + assert_raises(TypeError, grid_search.fit, [[1]]) def test_param_sampler(): From 5b8933d3b2257390eca62df98bcffe36e1b04c07 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sun, 12 Jan 2014 11:35:27 +0100 Subject: [PATCH 10/20] Clean up --- sklearn/cross_validation.py | 6 +-- sklearn/feature_selection/rfe.py | 6 +-- sklearn/grid_search.py | 12 ++--- sklearn/learning_curve.py | 5 +-- sklearn/metrics/scorer.py | 49 ++++++++------------ sklearn/metrics/tests/test_score_objects.py | 50 +++++++++++++++++++++ 6 files changed, 82 insertions(+), 46 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index c03cc4076c6b7..43c27904b73aa 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -27,7 +27,7 @@ from .utils.fixes import unique from .externals.joblib import Parallel, delayed from .externals.six import string_types, with_metaclass -from .metrics.scorer import check_scorable +from .metrics.scorer import check_scoring __all__ = ['Bootstrap', 'KFold', @@ -1087,7 +1087,7 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, """ X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scorable(estimator, score_func=score_func, scoring=scoring) + scorer = check_scoring(estimator, score_func=score_func, scoring=scoring) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. 
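After the rename to check_scoring, the behaviour exercised by the new tests in this patch is: an estimator that can predict but has no score method needs an explicit scoring argument. A small sketch assuming this branch, with a hypothetical dummy estimator:

    from sklearn.metrics.scorer import check_scoring

    class PredictOnlyEstimator(object):
        """Toy estimator with fit/predict but no score method."""
        def fit(self, X, y):
            self.y_ = y
            return self

        def predict(self, X):
            return self.y_

    est = PredictOnlyEstimator().fit([[1], [2]], [0, 1])

    # check_scoring(est) would raise TypeError: no scoring and no score method
    scorer = check_scoring(est, scoring="accuracy")
    acc = scorer(est, [[1], [2]], [0, 1])   # predictions match y, so 1.0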
parallel = Parallel(n_jobs=n_jobs, verbose=verbose, @@ -1323,7 +1323,7 @@ def permutation_test_score(estimator, X, y, score_func=None, cv=None, """ X, y = check_arrays(X, y, sparse_format='csr') cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scorable(estimator, scoring=scoring, score_func=score_func) + scorer = check_scoring(estimator, scoring=scoring, score_func=score_func) random_state = check_random_state(random_state) # We clone the estimator to make sure that all the folds are diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 54941036e044a..a58fd33d61f47 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -15,7 +15,7 @@ from ..cross_validation import _check_cv as check_cv from ..cross_validation import _split, _score from .base import SelectorMixin -from ..metrics.scorer import check_scorable +from ..metrics.scorer import check_scoring class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin): @@ -326,8 +326,8 @@ def fit(self, X, y): verbose=self.verbose - 1) cv = check_cv(self.cv, X, y, is_classifier(self.estimator)) - scorer = check_scorable(self.estimator, scoring=self.scoring, - loss_func=self.loss_func) + scorer = check_scoring(self.estimator, scoring=self.scoring, + loss_func=self.loss_func) scores = np.zeros(X.shape[1]) # Cross-validation diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index d26f2a74fdd89..4b440d23381e0 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -28,7 +28,7 @@ from .externals import six from .utils import safe_mask, check_random_state from .utils.validation import _num_samples, check_arrays -from .metrics.scorer import check_scorable +from .metrics.scorer import check_scoring __all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point', @@ -232,7 +232,7 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer, if verbose > 1: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) - print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')) + print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) estimator.set_params(**parameters) score, n_samples_test, scoring_time = _cross_val_score( @@ -243,7 +243,7 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer, msg += ", score=%f" % score if verbose > 1: end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) - print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) + print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) return score, parameters, n_samples_test @@ -358,9 +358,9 @@ def _fit(self, X, y, parameter_iterable): estimator = self.estimator cv = self.cv - self.scorer_ = check_scorable(self.estimator, scoring=self.scoring, - loss_func=self.loss_func, - score_func=self.score_func) + self.scorer_ = check_scoring(self.estimator, scoring=self.scoring, + loss_func=self.loss_func, + score_func=self.score_func) n_samples = _num_samples(X) X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr') diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index a7fa2c28ce4e0..26a742a22ed2f 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -12,7 +12,7 @@ from .externals.joblib import Parallel, delayed from .metrics.scorer import get_scorer from .cross_validation import _split, _fit, _score -from .metrics.scorer import check_scorable +from .metrics.scorer import check_scoring def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), @@ -94,7 +94,6 
@@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), ----- See :ref:`examples/plot_learning_curve.py ` """ - if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): raise ValueError("An estimator must support the partial_fit interface " "to exploit incremental learning") @@ -102,7 +101,7 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) # Make a list since we will be iterating multiple times over the folds cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) - scorer = check_scorable(estimator, scoring=scoring) + scorer = check_scoring(estimator, scoring=scoring) # HACK as long as boolean indices are allowed in cv generators if cv[0][0].dtype == bool: diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 73c32d60836cf..c4b89d2dfe1bb 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -198,13 +198,13 @@ def get_scorer(scoring): return scorer -class _passthrough_scorer(object): +class _PassthroughScorer(object): """Callable that wraps estimator.score""" def __call__(self, estimator, *args, **kwargs): return estimator.score(*args, **kwargs) -def check_scorable(estimator, scoring=None, loss_func=None, score_func=None): +def check_scoring(estimator, scoring=None, loss_func=None, score_func=None): """Check if estimator can be scored. A TypeError will be thrown if the estimator cannot be scored. @@ -219,46 +219,33 @@ def check_scorable(estimator, scoring=None, loss_func=None, score_func=None): a scorer callable object / function with signature ``scorer(estimator, X, y)``. - loss_func : callable or None, optional, default: None - A loss function callable object / function with signature - ``loss_func(estimator, X, y)``. - - score_func : callable or None, optional, default: None - A scoring function with signature - ``score_func(estimator, X, y)``. - Returns ------- scoring : callable A scorer callable object / function with signature ``scorer(estimator, X, y)``. """ + has_scoring = not (scoring is None and loss_func is None and + score_func is None) if not hasattr(estimator, 'fit'): raise TypeError("estimator should a be an estimator implementing " "'fit' method, %s (type %s) was passed" % (estimator, type(estimator))) - - if scoring is None and loss_func is None and score_func is None: - if hasattr(estimator, 'score'): - return _passthrough_scorer() - else: - raise TypeError( - "If no scoring is specified, the estimator passed should " - "have a 'score' method. The estimator %s (type %s) " - "does not." % (estimator, type(estimator))) + elif hasattr(estimator, 'predict') and has_scoring: + return _deprecate_loss_and_score_funcs(scoring=scoring, + loss_func=loss_func, score_func=score_func) + elif hasattr(estimator, 'score'): + return _PassthroughScorer() + elif not has_scoring: + raise TypeError( + "If no scoring is specified, the estimator passed should " + "have a 'score' method. The estimator %s (type %s) " + "does not." % (estimator, type(estimator))) else: - if hasattr(estimator, 'predict'): - scorer = _deprecate_loss_and_score_funcs(scoring=scoring, - loss_func=loss_func, score_func=score_func) - if scorer is None: - return ValueError("no scoring") - else: - return scorer - else: - raise TypeError( - "If a scoring is specified, the estimator passed should " - "have a 'predict' method. The estimator %s (type %s) " - "does not." 
% (estimator, type(estimator))) + raise TypeError( + "The estimator passed should have a 'score' or a 'predict' " + "method. The estimator %s (type %s) does not." + % (estimator, type(estimator))) def make_scorer(score_func, greater_is_better=True, needs_proba=False, diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index d7ea9f427074d..265e35b9b5034 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -9,6 +9,7 @@ from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score, log_loss) from sklearn.metrics.cluster import adjusted_rand_score +from sklearn.metrics.scorer import check_scoring from sklearn.metrics import make_scorer, SCORERS from sklearn.svm import LinearSVC from sklearn.cluster import KMeans @@ -22,6 +23,55 @@ from sklearn.multiclass import OneVsRestClassifier +class EstimatorWithoutFit(object): + """Dummy estimator to test check_scoring""" + pass + + +class EstimatorWithFit(object): + """Dummy estimator to test check_scoring""" + def fit(self, X, y): + return self + + +class EstimatorWithFitAndScore(object): + """Dummy estimator to test check_scoring""" + def fit(self, X, y): + return self + def score(self, X, y): + return 1.0 + + +class EstimatorWithFitAndPredict(object): + """Dummy estimator to test check_scoring""" + def fit(self, X, y): + self.y = y + return self + def predict(self, X): + return self.y + + +def test_check_scoring(): + """Test all branches of check_scoring""" + estimator = EstimatorWithoutFit() + assert_raises(TypeError, check_scoring, estimator) + + estimator = EstimatorWithFitAndScore() + estimator.fit([[1]], [1]) + scorer = check_scoring(estimator) + assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) + + estimator = EstimatorWithFitAndPredict() + estimator.fit([[1]], [1]) + assert_raises(TypeError, check_scoring, estimator) + + scorer = check_scoring(estimator, "accuracy") + assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) + + estimator = EstimatorWithFit() + assert_raises(TypeError, check_scoring, estimator) + + def test_make_scorer(): """Sanity check on the make_scorer factory function.""" f = lambda *args: 0 From 70aaef24e8f6dd2eb8921fd0e0f6a9504c7c9358 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sun, 12 Jan 2014 12:29:31 +0100 Subject: [PATCH 11/20] Replace '_fit_estimator' by '_cross_val_score' --- sklearn/cross_validation.py | 8 ++++++-- sklearn/learning_curve.py | 30 +++++++++------------------- sklearn/tests/test_learning_curve.py | 2 +- 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 43c27904b73aa..8de3e1474c082 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1100,7 +1100,7 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, def _cross_val_score(estimator, X, y, scorer, train, test, - verbose, fit_params): + verbose, fit_params, return_train_score=False): """Inner loop for cross validation""" n_samples = _num_samples(X) fit_params = fit_params if fit_params is not None else {} @@ -1120,7 +1120,11 @@ def _cross_val_score(estimator, X, y, scorer, train, test, if verbose > 1: print("score %f in %f s" % (score, scoring_time)) - return score, _num_samples(X_test), scoring_time + if return_train_score: + return (_score(estimator, X_train, y_train, scorer), score, + _num_samples(X_test), scoring_time) + else: + return score, _num_samples(X_test), scoring_time def 
_split(estimator, X, y, indices, train_indices=None): diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 26a742a22ed2f..c803ac242649d 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -11,7 +11,7 @@ from .utils import check_arrays from .externals.joblib import Parallel, delayed from .metrics.scorer import get_scorer -from .cross_validation import _split, _fit, _score +from .cross_validation import _split, _fit, _score, _cross_val_score from .metrics.scorer import check_scoring @@ -127,14 +127,16 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), classes = np.unique(y) else: classes = None + out = parallel(delayed(_incremental_fit_estimator)( - estimator, X, y, classes, train, test, train_sizes_abs, scorer, - verbose) for train, test in cv) + clone(estimator), X, y, classes, train, test, train_sizes_abs, + scorer, verbose) for train, test in cv) else: - out = parallel(delayed(_fit_estimator)( - estimator, X, y, train, test, n_train_samples, scorer, verbose) + out = parallel(delayed(_cross_val_score)( + clone(estimator), X, y, scorer, train[:n_train_samples], test, + verbose, fit_params=None, return_train_score=True) for train, test in cv for n_train_samples in train_sizes_abs) - out = np.array(out) + out = np.array(out)[:, :2] n_cv_folds = out.shape[0]/n_unique_ticks out = out.reshape(n_cv_folds, n_unique_ticks, 2) @@ -202,23 +204,9 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): return train_sizes_abs -def _fit_estimator(base_estimator, X, y, train, test, - n_train_samples, scorer, verbose): - """Train estimator on a training subset and compute scores.""" - train_subset = train[:n_train_samples] - estimator = clone(base_estimator) - X_train, y_train = _split(estimator, X, y, train_subset) - X_test, y_test = _split(estimator, X, y, test, train_subset) - _fit(estimator.fit, X_train, y_train) - train_score = _score(estimator, X_train, y_train, scorer) - test_score = _score(estimator, X_test, y_test, scorer) - return train_score, test_score - - -def _incremental_fit_estimator(base_estimator, X, y, classes, train, test, +def _incremental_fit_estimator(estimator, X, y, classes, train, test, train_sizes, scorer, verbose): """Train estimator on training subsets incrementally and compute scores.""" - estimator = clone(base_estimator) train_scores, test_scores = [], [] partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) for n_train_samples, partial_train in partitions: diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index 66b8f36279b97..1d43fdfb0eb4a 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -170,7 +170,7 @@ def test_learning_curve_with_boolean_indices(): estimator = MockImprovingClassifier(20) cv = KFold(n=30, n_folds=3, indices=False) train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, - cv=cv) + cv=cv) assert_array_equal(train_sizes, np.linspace(2, 20, 10)) assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10)) assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) From 13c791595ca59456f861a6f1a88ecfb4fc7c1ade Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sun, 12 Jan 2014 13:09:22 +0100 Subject: [PATCH 12/20] Fix PEP8, style and documentation --- sklearn/cross_validation.py | 2 +- sklearn/grid_search.py | 3 +-- sklearn/learning_curve.py | 7 +------ sklearn/metrics/scorer.py | 14 ++++++-------- sklearn/metrics/tests/test_score_objects.py | 2 ++ 5 
files changed, 11 insertions(+), 17 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 8de3e1474c082..8eb08f9a95deb 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -26,7 +26,7 @@ from .utils.validation import _num_samples from .utils.fixes import unique from .externals.joblib import Parallel, delayed -from .externals.six import string_types, with_metaclass +from .externals.six import with_metaclass from .metrics.scorer import check_scoring __all__ = ['Bootstrap', diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 4b440d23381e0..d6a29273f7aa6 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -14,7 +14,6 @@ from collections import Mapping, namedtuple, Sized from functools import partial, reduce from itertools import product -import numbers import operator import warnings @@ -26,7 +25,7 @@ from .cross_validation import _cross_val_score from .externals.joblib import Parallel, delayed, logger from .externals import six -from .utils import safe_mask, check_random_state +from .utils import check_random_state from .utils.validation import _num_samples, check_arrays from .metrics.scorer import check_scoring diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index c803ac242649d..5ca556a99417f 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -10,7 +10,6 @@ from .cross_validation import _check_cv from .utils import check_arrays from .externals.joblib import Parallel, delayed -from .metrics.scorer import get_scorer from .cross_validation import _split, _fit, _score, _cross_val_score from .metrics.scorer import check_scoring @@ -123,11 +122,7 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) if exploit_incremental_learning: - if is_classifier(estimator): - classes = np.unique(y) - else: - classes = None - + classes = np.unique(y) if is_classifier(estimator) else None out = parallel(delayed(_incremental_fit_estimator)( clone(estimator), X, y, classes, train, test, train_sizes_abs, scorer, verbose) for train, test in cv) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index c4b89d2dfe1bb..9fbf4893652c6 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -205,7 +205,7 @@ def __call__(self, estimator, *args, **kwargs): def check_scoring(estimator, scoring=None, loss_func=None, score_func=None): - """Check if estimator can be scored. + """Determine scorer from user options. A TypeError will be thrown if the estimator cannot be scored. @@ -229,23 +229,21 @@ def check_scoring(estimator, scoring=None, loss_func=None, score_func=None): score_func is None) if not hasattr(estimator, 'fit'): raise TypeError("estimator should a be an estimator implementing " - "'fit' method, %s (type %s) was passed" % - (estimator, type(estimator))) + "'fit' method, %r was passed" % estimator) elif hasattr(estimator, 'predict') and has_scoring: return _deprecate_loss_and_score_funcs(scoring=scoring, - loss_func=loss_func, score_func=score_func) + loss_func=loss_func, + score_func=score_func) elif hasattr(estimator, 'score'): return _PassthroughScorer() elif not has_scoring: raise TypeError( "If no scoring is specified, the estimator passed should " - "have a 'score' method. The estimator %s (type %s) " - "does not." % (estimator, type(estimator))) + "have a 'score' method. The estimator %r does not." 
% estimator) else: raise TypeError( "The estimator passed should have a 'score' or a 'predict' " - "method. The estimator %s (type %s) does not." - % (estimator, type(estimator))) + "method. The estimator %r does not." % estimator) def make_scorer(score_func, greater_is_better=True, needs_proba=False, diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 265e35b9b5034..3cda7aadece52 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -38,6 +38,7 @@ class EstimatorWithFitAndScore(object): """Dummy estimator to test check_scoring""" def fit(self, X, y): return self + def score(self, X, y): return 1.0 @@ -47,6 +48,7 @@ class EstimatorWithFitAndPredict(object): def fit(self, X, y): self.y = y return self + def predict(self, X): return self.y From 7b951d8c63220c556c9d90a65720dad6cbb78174 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sun, 12 Jan 2014 13:41:37 +0100 Subject: [PATCH 13/20] Remove wrong variable names --- sklearn/feature_selection/rfe.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index a58fd33d61f47..ca6ede9a7c760 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -340,12 +340,9 @@ def fit(self, X, y): # Score each subset of features for k in range(0, max(ranking_)): mask = np.where(ranking_ <= k + 1)[0] - X_train_subset = X_train[:, mask] - X_test_subset = X_test[:, mask] - estimator = clone(self.estimator) - estimator.fit(X_train_subset, y_train) - score = _score(estimator, X_test_subset, y_test, scorer) + estimator.fit(X_train[:, mask], y_train) + score = _score(estimator, X_test[:, mask], y_test, scorer) if self.verbose > 0: print("Finished fold with %d / %d feature ranks, score=%f" From 5b211cd96543090ac5a3684b85e4fcae620003c9 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Tue, 14 Jan 2014 11:12:02 +0100 Subject: [PATCH 14/20] Remove helper function '_fit' --- sklearn/cross_validation.py | 13 ++++--------- sklearn/learning_curve.py | 10 +++++++--- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 8eb08f9a95deb..7eaadafa2dfec 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1112,7 +1112,10 @@ def _cross_val_score(estimator, X, y, scorer, train, test, X_train, y_train = _split(estimator, X, y, train) X_test, y_test = _split(estimator, X, y, test, train) - _fit(estimator.fit, X_train, y_train, **fit_params) + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) score = _score(estimator, X_test, y_test, scorer) scoring_time = time.time() - start_time @@ -1159,14 +1162,6 @@ def _split(estimator, X, y, indices, train_indices=None): return X_subset, y_subset -def _fit(fit_function, X_train, y_train, **fit_params): - """Fit an estimator on a given training set.""" - if y_train is None: - fit_function(X_train, **fit_params) - else: - fit_function(X_train, y_train, **fit_params) - - def _score(estimator, X_test, y_test, scorer): """Compute the score of an estimator on a given test set.""" if y_test is None: diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 5ca556a99417f..6ba706ed7442f 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -10,7 +10,7 @@ from .cross_validation import _check_cv from .utils import 
check_arrays
 from .externals.joblib import Parallel, delayed
-from .cross_validation import _split, _fit, _score, _cross_val_score
+from .cross_validation import _split, _score, _cross_val_score
 from .metrics.scorer import check_scoring


@@ -209,8 +209,12 @@ def _incremental_fit_estimator(estimator, X, y, classes, train, test,
         X_partial_train, y_partial_train = _split(estimator, X, y,
                                                    partial_train)
         X_test, y_test = _split(estimator, X, y, test, train[:n_train_samples])
-        _fit(estimator.partial_fit, X_partial_train, y_partial_train,
-             classes=classes)
+        if y_partial_train is None:
+            estimator.partial_fit(X_partial_train, classes=classes)
+        else:
+            estimator.partial_fit(X_partial_train, y_partial_train,
+                                  classes=classes)
         train_scores.append(_score(estimator, X_train, y_train, scorer))
         test_scores.append(_score(estimator, X_test, y_test, scorer))
     return np.array((train_scores, test_scores)).T
+

From 2330ebe883d86580b316ea0927524cb1cbacad07 Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Wed, 15 Jan 2014 22:05:14 +0100
Subject: [PATCH 15/20] Remove 'fit_grid_point' from 'BaseSearchCV'

---
 sklearn/cross_validation.py | 105 ++++++++++++++++++++++++++++++------
 sklearn/grid_search.py      |  32 ++++-------
 sklearn/learning_curve.py   |   6 +--
 3 files changed, 102 insertions(+), 41 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 7eaadafa2dfec..ddaf8cd923ead 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -25,7 +25,7 @@
 from .utils import check_arrays, check_random_state, safe_mask
 from .utils.validation import _num_samples
 from .utils.fixes import unique
-from .externals.joblib import Parallel, delayed
+from .externals.joblib import Parallel, delayed, logger
 from .externals.six import with_metaclass
 from .metrics.scorer import check_scoring

@@ -1092,22 +1092,90 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
     # independent, and that it is pickle-able.
     parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                         pre_dispatch=pre_dispatch)
-    scores = parallel(
-        delayed(_cross_val_score)(clone(estimator), X, y, scorer, train, test,
-                                  verbose=verbose, fit_params=fit_params)
-        for train, test in cv)
+    scores = parallel(delayed(fit_and_score)(clone(estimator), X, y, scorer,
+                                             train, test, verbose, None,
+                                             fit_params)
+                      for train, test in cv)
     return np.array(scores)[:, 0]


-def _cross_val_score(estimator, X, y, scorer, train, test,
-                     verbose, fit_params, return_train_score=False):
-    """Inner loop for cross validation"""
+def fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
+                  fit_params, return_train_score=False,
+                  return_parameters=False):
+    """Fit estimator and compute scores for a given dataset split.
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : array-like of shape at least 2D
+        The data to fit.
+
+    y : array-like, optional, default: None
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    scorer : callable
+        A scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+
+    train : array-like, shape = (n_train_samples,)
+        Indices of training samples.
+
+    test : array-like, shape = (n_test_samples,)
+        Indices of test samples.
+
+    verbose : integer
+        The verbosity level.
+
+    parameters : dict or None
+        Parameters to be set on the estimator.
+
+    fit_params : dict or None
+        Parameters that will be passed to ``estimator.fit``.
+
+    return_train_score : boolean, optional, default: False
+        Compute and return score on training set.
+
+    return_parameters : boolean, optional, default: False
+        Return parameters that have been used for the estimator.
+
+    Returns
+    -------
+    test_score : float
+        Score on test set.
+
+    train_score : float, optional
+        Score on training set.
+
+    n_test_samples : int
+        Number of test samples.
+
+    scoring_time : float
+        Time spent for fitting and scoring in seconds.
+
+    parameters : dict or None, optional
+        The parameters that have been evaluated.
+    """
+    if verbose > 1:
+        if parameters is None:
+            msg = "no parameters to be set"
+        else:
+            msg = '%s' % (', '.join('%s=%s' % (k, v)
+                          for k, v in parameters.items()))
+        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))
+
+    # Adjust length of sample weights
     n_samples = _num_samples(X)
     fit_params = fit_params if fit_params is not None else {}
     fit_params = dict([(k, np.asarray(v)[train]
                        if hasattr(v, '__len__') and len(v) == n_samples else v)
                       for k, v in fit_params.items()])
+    if parameters is not None:
+        estimator.set_params(**parameters)
+
     start_time = time.time()

     X_train, y_train = _split(estimator, X, y, train)
@@ -1116,18 +1184,23 @@ def _cross_val_score(estimator, X, y, scorer, train, test,
         estimator.fit(X_train, **fit_params)
     else:
         estimator.fit(X_train, y_train, **fit_params)
-    score = _score(estimator, X_test, y_test, scorer)
+    test_score = _score(estimator, X_test, y_test, scorer)
+    if return_train_score:
+        train_score = _score(estimator, X_train, y_train, scorer)

     scoring_time = time.time() - start_time

+    if verbose > 2:
+        msg += ", score=%f" % test_score
     if verbose > 1:
-        print("score %f in %f s" % (score, scoring_time))
-
-    return score, _num_samples(X_test), scoring_time
+        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
+        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
+
+    ret = [train_score] if return_train_score else []
+    ret.extend([test_score, _num_samples(X_test), scoring_time])
+    if return_parameters:
+        ret.append(parameters)
+    return ret


 def _split(estimator, X, y, indices, train_indices=None):
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
index d6a29273f7aa6..e7e4b3bdf17c3 100644
--- a/sklearn/grid_search.py
+++ b/sklearn/grid_search.py
@@ -22,8 +22,8 @@
 from .base import BaseEstimator, is_classifier, clone
 from .base import MetaEstimatorMixin
 from .cross_validation import _check_cv as check_cv
-from .cross_validation import _cross_val_score
-from .externals.joblib import Parallel, delayed, logger
+from .cross_validation import fit_and_score
+from .externals.joblib import Parallel, delayed
 from .externals import six
 from .utils import check_random_state
 from .utils.validation import _num_samples, check_arrays
@@ -228,22 +228,9 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
     n_samples_test : int
         Number of test samples in this split.
""" - if verbose > 1: - msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) - print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) - - estimator.set_params(**parameters) - score, n_samples_test, scoring_time = _cross_val_score( - estimator, X, y, scorer, train, test, verbose=0, - fit_params=fit_params) - - if verbose > 2: - msg += ", score=%f" % score - if verbose > 1: - end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) - print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - + score, n_samples_test, _ = fit_and_score(estimator, X, y, scorer, train, + test, verbose, parameters, + fit_params) return score, parameters, n_samples_test @@ -386,9 +373,10 @@ def _fit(self, X, y, parameter_iterable): out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch)( - delayed(fit_grid_point)( - X, y, clone(base_estimator), parameters, train, test, - self.scorer_, self.verbose, **self.fit_params) + delayed(fit_and_score)( + clone(base_estimator), X, y, self.scorer_, train, test, + self.verbose, parameters, self.fit_params, + return_parameters=True) for parameters in parameter_iterable for train, test in cv) @@ -402,7 +390,7 @@ def _fit(self, X, y, parameter_iterable): n_test_samples = 0 score = 0 all_scores = [] - for this_score, parameters, this_n_test_samples in \ + for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 6ba706ed7442f..149de7b5c2c11 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -10,7 +10,7 @@ from .cross_validation import _check_cv from .utils import check_arrays from .externals.joblib import Parallel, delayed -from .cross_validation import _split, _score, _cross_val_score +from .cross_validation import _split, _score, fit_and_score from .metrics.scorer import check_scoring @@ -127,9 +127,9 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), clone(estimator), X, y, classes, train, test, train_sizes_abs, scorer, verbose) for train, test in cv) else: - out = parallel(delayed(_cross_val_score)( + out = parallel(delayed(fit_and_score)( clone(estimator), X, y, scorer, train[:n_train_samples], test, - verbose, fit_params=None, return_train_score=True) + verbose, parameters=None, fit_params=None, return_train_score=True) for train, test in cv for n_train_samples in train_sizes_abs) out = np.array(out)[:, :2] n_cv_folds = out.shape[0]/n_unique_ticks From f4aa5cad7178d20a45cd9bc606bec44518aecf06 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 15 Jan 2014 22:16:27 +0100 Subject: [PATCH 16/20] Check substrings of error messages --- sklearn/metrics/tests/test_score_objects.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 3cda7aadece52..c3ad63c18d4ef 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -4,6 +4,7 @@ from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import ignore_warnings from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score, @@ -56,7 +57,7 @@ def predict(self, X): def test_check_scoring(): """Test all branches of check_scoring""" estimator = EstimatorWithoutFit() - 
assert_raises(TypeError, check_scoring, estimator) + assert_raise_message(TypeError, "'fit' method", check_scoring, estimator) estimator = EstimatorWithFitAndScore() estimator.fit([[1]], [1]) @@ -65,13 +66,14 @@ def test_check_scoring(): estimator = EstimatorWithFitAndPredict() estimator.fit([[1]], [1]) - assert_raises(TypeError, check_scoring, estimator) + assert_raise_message(TypeError, "no scoring", check_scoring, estimator) scorer = check_scoring(estimator, "accuracy") assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) estimator = EstimatorWithFit() - assert_raises(TypeError, check_scoring, estimator) + assert_raise_message(TypeError, "'score' or a 'predict'", check_scoring, + estimator, "accuracy") def test_make_scorer(): From 33994f024b4f8dcf48485650d4fbe0e16dede623 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 15 Jan 2014 22:30:13 +0100 Subject: [PATCH 17/20] Rename '_split' to '_split_with_kernel' --- sklearn/cross_validation.py | 6 +++--- sklearn/feature_selection/rfe.py | 7 ++++--- sklearn/learning_curve.py | 12 +++++++----- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index ddaf8cd923ead..f9a67228443ae 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1178,8 +1178,8 @@ def fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, start_time = time.time() - X_train, y_train = _split(estimator, X, y, train) - X_test, y_test = _split(estimator, X, y, test, train) + X_train, y_train = _split_with_kernel(estimator, X, y, train) + X_test, y_test = _split_with_kernel(estimator, X, y, test, train) if y_train is None: estimator.fit(X_train, **fit_params) else: @@ -1203,7 +1203,7 @@ def fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, return ret -def _split(estimator, X, y, indices, train_indices=None): +def _split_with_kernel(estimator, X, y, indices, train_indices=None): """Create subset of dataset.""" if hasattr(estimator, 'kernel') and callable(estimator.kernel): # cannot compute the kernel values with custom function diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index ca6ede9a7c760..8767baa53c66e 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -13,7 +13,7 @@ from ..base import clone from ..base import is_classifier from ..cross_validation import _check_cv as check_cv -from ..cross_validation import _split, _score +from ..cross_validation import _split_with_kernel, _score from .base import SelectorMixin from ..metrics.scorer import check_scoring @@ -332,8 +332,9 @@ def fit(self, X, y): # Cross-validation for n, (train, test) in enumerate(cv): - X_train, y_train = _split(self.estimator, X, y, train) - X_test, y_test = _split(self.estimator, X, y, test, train) + X_train, y_train = _split_with_kernel(self.estimator, X, y, train) + X_test, y_test = _split_with_kernel(self.estimator, X, y, test, + train) # Compute a full ranking of the features ranking_ = rfe.fit(X_train, y_train).ranking_ diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 149de7b5c2c11..8d8fb9405b1cb 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -10,7 +10,7 @@ from .cross_validation import _check_cv from .utils import check_arrays from .externals.joblib import Parallel, delayed -from .cross_validation import _split, _score, fit_and_score +from .cross_validation import _split_with_kernel, _score, fit_and_score from .metrics.scorer import 
check_scoring @@ -205,10 +205,12 @@ def _incremental_fit_estimator(estimator, X, y, classes, train, test, train_scores, test_scores = [], [] partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) for n_train_samples, partial_train in partitions: - X_train, y_train = _split(estimator, X, y, train[:n_train_samples]) - X_partial_train, y_partial_train = _split(estimator, X, y, - partial_train) - X_test, y_test = _split(estimator, X, y, test, train[:n_train_samples]) + X_train, y_train = _split_with_kernel(estimator, X, y, + train[:n_train_samples]) + X_partial_train, y_partial_train = _split_with_kernel(estimator, X, y, + partial_train) + X_test, y_test = _split_with_kernel(estimator, X, y, test, + train[:n_train_samples]) if y_partial_train is None: estimator.partial_fit(X_partial_train, classes=classes) else: From 4494f152bbc34480974b6704415fe674982844df Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 15 Jan 2014 22:41:36 +0100 Subject: [PATCH 18/20] _passthrough_scorer is a function --- sklearn/metrics/scorer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 9fbf4893652c6..dfce4a837faa3 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -198,10 +198,9 @@ def get_scorer(scoring): return scorer -class _PassthroughScorer(object): - """Callable that wraps estimator.score""" - def __call__(self, estimator, *args, **kwargs): - return estimator.score(*args, **kwargs) +def _passthrough_scorer(estimator, *args, **kwargs): + """Function that wraps estimator.score""" + return estimator.score(*args, **kwargs) def check_scoring(estimator, scoring=None, loss_func=None, score_func=None): @@ -235,7 +234,7 @@ def check_scoring(estimator, scoring=None, loss_func=None, score_func=None): loss_func=loss_func, score_func=score_func) elif hasattr(estimator, 'score'): - return _PassthroughScorer() + return _passthrough_scorer elif not has_scoring: raise TypeError( "If no scoring is specified, the estimator passed should " From 062d4001b5ca6532d5416f806c34f6a49fa8bc64 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 15 Jan 2014 23:10:56 +0100 Subject: [PATCH 19/20] Remove '_deprecate_loss_and_score_funcs' --- sklearn/linear_model/ridge.py | 10 ++-- sklearn/metrics/scorer.py | 60 ++++++++++----------- sklearn/metrics/tests/test_score_objects.py | 5 ++ 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index 0e93bbd25c3e5..4f61b149cd721 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -25,7 +25,7 @@ from ..preprocessing import LabelBinarizer from ..grid_search import GridSearchCV from ..externals import six -from ..metrics.scorer import _deprecate_loss_and_score_funcs +from ..metrics.scorer import check_scoring def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3): @@ -728,12 +728,10 @@ def fit(self, X, y, sample_weight=1.0): cv_values = np.zeros((n_samples * n_y, len(self.alphas))) C = [] - scorer = _deprecate_loss_and_score_funcs( - self.loss_func, self.score_func, self.scoring, - score_overrides_loss=True - ) + scorer = check_scoring(self, scoring=self.scoring, allow_none=True, + loss_func=self.loss_func, score_func=self.score_func, + score_overrides_loss=True) error = scorer is None - #error = self.score_func is None and self.loss_func is None for i, alpha in enumerate(self.alphas): if error: diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py 
index dfce4a837faa3..da5a05b0d0f31 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -157,34 +157,6 @@ def _factory_args(self): return ", needs_threshold=True" -def _deprecate_loss_and_score_funcs( - loss_func=None, score_func=None, scoring=None, - score_overrides_loss=False): - - scorer = None - if loss_func is not None or score_func is not None: - - if loss_func is not None: - warn("Passing a loss function is " - "deprecated and will be removed in 0.15. " - "Either use strings or score objects. " - "The relevant new parameter is called ''scoring''. ", - category=DeprecationWarning, stacklevel=2) - scorer = make_scorer(loss_func, greater_is_better=False) - if score_func is not None: - warn("Passing function as ``score_func`` is " - "deprecated and will be removed in 0.15. " - "Either use strings or score objects. " - "The relevant new parameter is called ''scoring''.", - category=DeprecationWarning, stacklevel=2) - if loss_func is None or score_overrides_loss: - scorer = make_scorer(score_func) - - else: - scorer = get_scorer(scoring) - return scorer - - def get_scorer(scoring): if isinstance(scoring, six.string_types): try: @@ -203,7 +175,8 @@ def _passthrough_scorer(estimator, *args, **kwargs): return estimator.score(*args, **kwargs) -def check_scoring(estimator, scoring=None, loss_func=None, score_func=None): +def check_scoring(estimator, scoring=None, allow_none=False, loss_func=None, + score_func=None, score_overrides_loss=False): """Determine scorer from user options. A TypeError will be thrown if the estimator cannot be scored. @@ -218,6 +191,10 @@ def check_scoring(estimator, scoring=None, loss_func=None, score_func=None): a scorer callable object / function with signature ``scorer(estimator, X, y)``. + allow_none : boolean, optional, default: False + If no scoring is specified and the estimator has no score function, we + can either return None or raise an exception. + Returns ------- scoring : callable @@ -230,12 +207,31 @@ def check_scoring(estimator, scoring=None, loss_func=None, score_func=None): raise TypeError("estimator should a be an estimator implementing " "'fit' method, %r was passed" % estimator) elif hasattr(estimator, 'predict') and has_scoring: - return _deprecate_loss_and_score_funcs(scoring=scoring, - loss_func=loss_func, - score_func=score_func) + scorer = None + if loss_func is not None or score_func is not None: + if loss_func is not None: + warn("Passing a loss function is " + "deprecated and will be removed in 0.15. " + "Either use strings or score objects. " + "The relevant new parameter is called ''scoring''. ", + category=DeprecationWarning, stacklevel=2) + scorer = make_scorer(loss_func, greater_is_better=False) + if score_func is not None: + warn("Passing function as ``score_func`` is " + "deprecated and will be removed in 0.15. " + "Either use strings or score objects. " + "The relevant new parameter is called ''scoring''.", + category=DeprecationWarning, stacklevel=2) + if loss_func is None or score_overrides_loss: + scorer = make_scorer(score_func) + else: + scorer = get_scorer(scoring) + return scorer elif hasattr(estimator, 'score'): return _passthrough_scorer elif not has_scoring: + if allow_none: + return None raise TypeError( "If no scoring is specified, the estimator passed should " "have a 'score' method. The estimator %r does not." 
% estimator) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index c3ad63c18d4ef..5dea88c1c5e1f 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -5,6 +5,7 @@ from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raise_message +from sklearn.utils.testing import assert_true from sklearn.utils.testing import ignore_warnings from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score, @@ -75,6 +76,10 @@ def test_check_scoring(): assert_raise_message(TypeError, "'score' or a 'predict'", check_scoring, estimator, "accuracy") + estimator = EstimatorWithFit() + scorer = check_scoring(estimator, allow_none=True) + assert_true(scorer is None) + def test_make_scorer(): """Sanity check on the make_scorer factory function.""" From 8bc79b3fe311c54072839b3a2e7b2b0f8a3ebc0a Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 15 Jan 2014 23:18:55 +0100 Subject: [PATCH 20/20] Check error message --- sklearn/tests/test_grid_search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index 8d7107d10d9db..6d1a0d4f2ccc6 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -497,7 +497,8 @@ def test_bad_estimator(): sc = SpectralClustering() grid_search = GridSearchCV(sc, param_grid=dict(gamma=[.1, 1, 10]), scoring='ari') - assert_raises(TypeError, grid_search.fit, [[1]]) + assert_raise_message(TypeError, "'score' or a 'predict'", grid_search.fit, + [[1]]) def test_param_sampler():
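
Illustrative usage sketch (not part of the patch series): the snippet below shows how the helpers introduced above are intended to fit together once all twenty patches are applied. 'check_scoring' resolves the user-facing scoring option into a scorer callable, and 'fit_and_score' fits and evaluates one train/test split, which is exactly what 'cross_val_score', 'GridSearchCV' and 'learning_curve' now do internally. The imports mirror the module layout in the hunks above; the SVC estimator, the toy data and the fold setup are invented purely for illustration.

    import numpy as np

    from sklearn.base import clone
    from sklearn.cross_validation import KFold, fit_and_score
    from sklearn.metrics.scorer import check_scoring
    from sklearn.svm import SVC

    # Toy problem: 20 samples, 3 features, two alternating classes.
    X = np.random.RandomState(0).rand(20, 3)
    y = np.array([0, 1] * 10)

    estimator = SVC(kernel='linear')
    # check_scoring turns the scoring option into a scorer callable;
    # with scoring=None it would fall back to estimator.score instead.
    scorer = check_scoring(estimator, scoring='accuracy')

    # One fit_and_score call per CV split, on a fresh clone each time.
    cv = KFold(len(y), n_folds=4)
    results = [fit_and_score(clone(estimator), X, y, scorer, train, test,
                             verbose=0, parameters=None, fit_params=None,
                             return_train_score=True)
               for train, test in cv]

    # With return_train_score=True each entry is
    # [train_score, test_score, n_test_samples, scoring_time].
    train_scores, test_scores = np.array(results)[:, :2].T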