From bf789be7d499c8242e877defdc403abd66b8a343 Mon Sep 17 00:00:00 2001 From: Gael varoquaux Date: Tue, 14 Sep 2010 23:39:01 +0200 Subject: [PATCH 1/3] ENH: GridsearchCV, Pipelines and cross validation Make it possible to pipeline GridsearchCV, Give a meaningful scaling to the scores in GridsearchCV Store the scores in GridsearchCV --- scikits/learn/grid_search.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/scikits/learn/grid_search.py b/scikits/learn/grid_search.py index bed1bc5893c08..00574219c4f29 100644 --- a/scikits/learn/grid_search.py +++ b/scikits/learn/grid_search.py @@ -10,7 +10,7 @@ from .externals.joblib import Parallel, delayed from .cross_val import KFold, StratifiedKFold -from .base import ClassifierMixin, clone +from .base import BaseEstimator, ClassifierMixin, clone try: from itertools import product @@ -50,9 +50,11 @@ def iter_grid(param_grid): if hasattr(param_grid, 'has_key'): param_grid = [param_grid] for p in param_grid: - keys = p.keys() - for v in product(*p.values()): - params = dict(zip(keys,v)) + # Always sort the keys of a dictionary, for reproducibility + items = sorted(p.items()) + keys, values = zip(*items) + for v in product(*values): + params = dict(zip(keys, v)) yield params @@ -65,7 +67,8 @@ def fit_grid_point(X, y, base_clf, clf_params, cv, loss_func, iid, clf = copy.deepcopy(base_clf) clf._set_params(**clf_params) - score = 0 + score = 0. + n_test_samples = 0. 
for train, test in cv: clf.fit(X[train], y[train], **fit_params) y_test = y[test] @@ -76,13 +79,16 @@ def fit_grid_point(X, y, base_clf, clf_params, cv, loss_func, iid, this_score = clf.score(X[test], y_test) if iid: this_score *= len(y_test) + n_test_samples += len(y_test) score += this_score + if iid: + score /= n_test_samples - return clf, score + return score, clf ################################################################################ -class GridSearchCV(object): +class GridSearchCV(BaseEstimator): """ Grid search on the parameters of a classifier. @@ -195,12 +201,17 @@ def fit(self, X, y, cv=None, **kw): cv, self.loss_func, self.iid, **self.fit_params) for clf_params in grid) - # Out is a list of pairs: estimator, score - key = lambda pair: pair[1] - best_estimator = max(out, key=key)[0] # get maximum score + # Out is a list of pairs: score, estimator + best_estimator = max(out)[1] # get maximum score self.best_estimator = best_estimator self.predict = best_estimator.predict + self.score = best_estimator.score + + # Store the computed scores + grid = iter_grid(self.param_grid) + self.grid_points_scores_ = dict((tuple(clf_params.items()), score) + for clf_params, (score, _) in zip(grid, out)) return self From 452f72396f442842e994708e204d567dec3092c4 Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Wed, 15 Sep 2010 14:35:11 +0200 Subject: [PATCH 2/3] ENH: Make sure that Pipeline and GridSearch objects are indeed recognized as classifiers, if they contain a classifier. 
--- scikits/learn/base.py | 23 +++++++++++++++++++++++ scikits/learn/cross_val.py | 6 ++---- scikits/learn/grid_search.py | 6 ++---- scikits/learn/tests/test_base.py | 19 +++++++++++++++++-- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/scikits/learn/base.py b/scikits/learn/base.py index 30123179e360e..9d30fe92351c1 100644 --- a/scikits/learn/base.py +++ b/scikits/learn/base.py @@ -220,3 +220,26 @@ def score(self, X, y): z : float """ return explained_variance(self.predict(X), y) + + +################################################################################ +# XXX: Temporary solution to figure out if an estimator is a classifier + +def _get_sub_estimator(estimator): + """ Returns the final estimator if there is any. + """ + if hasattr(estimator, 'estimator'): + # GridSearchCV and other CV-tuned estimators + return _get_sub_estimator(estimator.estimator) + if hasattr(estimator, 'steps'): + # Pipeline + return _get_sub_estimator(estimator.steps[-1][1]) + return estimator + + +def is_classifier(estimator): + """ Returns True if the given estimator is (probably) a classifier. 
+ """ + estimator = _get_sub_estimator(estimator) + return isinstance(estimator, ClassifierMixin) + diff --git a/scikits/learn/cross_val.py b/scikits/learn/cross_val.py index b361d8aa7de7e..6681d84665a46 100644 --- a/scikits/learn/cross_val.py +++ b/scikits/learn/cross_val.py @@ -9,7 +9,7 @@ from math import ceil import numpy as np -from .base import ClassifierMixin +from .base import is_classifier from .utils.extmath import factorial, combinations from .externals.joblib import Parallel, delayed @@ -485,9 +485,7 @@ def cross_val_score(estimator, X, y=None, score_func=None, cv=None, """ n_samples = len(X) if cv is None: - if y is not None and (isinstance(estimator, ClassifierMixin) - or (hasattr(estimator, 'estimator') - and isinstance(estimator.estimator, ClassifierMixin))): + if y is not None and is_classifier(estimator): cv = StratifiedKFold(y, k=3) else: cv = KFold(n_samples, k=3) diff --git a/scikits/learn/grid_search.py b/scikits/learn/grid_search.py index 00574219c4f29..311693277ed3a 100644 --- a/scikits/learn/grid_search.py +++ b/scikits/learn/grid_search.py @@ -10,7 +10,7 @@ from .externals.joblib import Parallel, delayed from .cross_val import KFold, StratifiedKFold -from .base import BaseEstimator, ClassifierMixin, clone +from .base import BaseEstimator, is_classifier, clone try: from itertools import product @@ -187,9 +187,7 @@ def fit(self, X, y, cv=None, **kw): estimator = self.estimator if cv is None: n_samples = len(X) - if y is not None and (isinstance(estimator, ClassifierMixin) - or (hasattr(estimator, 'estimator') - and isinstance(estimator.estimator, ClassifierMixin))): + if y is not None and is_classifier(estimator): cv = StratifiedKFold(y, k=3) else: cv = KFold(n_samples, k=3) diff --git a/scikits/learn/tests/test_base.py b/scikits/learn/tests/test_base.py index 9d9a73e517bea..de7808bd5c94d 100644 --- a/scikits/learn/tests/test_base.py +++ b/scikits/learn/tests/test_base.py @@ -1,6 +1,10 @@ + +# Author: Gael Varoquaux +# License: BSD + from 
nose.tools import assert_true, assert_false, assert_equal, \ assert_raises -from ..base import BaseEstimator, clone +from ..base import BaseEstimator, clone, is_classifier ################################################################################ # A few test classes @@ -74,7 +78,6 @@ def test_str(): def test_get_params(): - test = T(K(), K()) assert_true('a__d' in test._get_params(deep=True)) @@ -84,3 +87,15 @@ def test_get_params(): assert test.a.d == 2 assert_raises(AssertionError, test._set_params, a__a=2) + +def test_is_classifier(): + from ..svm import SVC + from ..pipeline import Pipeline + from ..grid_search import GridSearchCV + svc = SVC() + assert_true(is_classifier(svc)) + assert_true(is_classifier(GridSearchCV(svc, {'C': [0.1, 1]}))) + assert_true(is_classifier(Pipeline([('svc', svc)]))) + assert_true(is_classifier(Pipeline([('svc_cv', + GridSearchCV(svc, {'C': [0.1, 1]}))]))) + From a4ed78e23f83a0723e06157b4777cd3b4b382ac9 Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Wed, 15 Sep 2010 17:19:17 +0200 Subject: [PATCH 3/3] ENH: Make sure clone works on pipelines and use clone in cross_val_func --- scikits/learn/base.py | 29 ++++++++++++++++++++++++---- scikits/learn/cross_val.py | 6 ++++-- scikits/learn/pipeline.py | 2 +- scikits/learn/tests/test_pipeline.py | 14 ++++++++++++-- 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/scikits/learn/base.py b/scikits/learn/base.py index 9d30fe92351c1..f0cfabf93fcc6 100644 --- a/scikits/learn/base.py +++ b/scikits/learn/base.py @@ -6,24 +6,45 @@ # License: BSD Style import inspect +import copy import numpy as np from .metrics import explained_variance ################################################################################ -def clone(estimator): +def clone(estimator, safe=True): """ Constructs a new estimator with the same parameters. Clone does a deep copy of the model in an estimator without actually copying attached data. 
It yields a new estimator with the same parameters that has not been fit on any data. + + Parameters + ============ + estimator: estimator object, or list, tuple or set of objects + The estimator or group of estimators to be cloned + safe: boolean, optional + If safe is false, clone will fall back to a deepcopy on objects + that are not estimators. + """ + estimator_type = type(estimator) + # XXX: not handling dictionaries + if estimator_type in (list, tuple, set, frozenset): + return estimator_type([clone(e, safe=safe) for e in estimator]) + elif not hasattr(estimator, '_get_params'): + if not safe: + return copy.deepcopy(estimator) + else: + raise ValueError("Cannot clone object '%s' (type %s): " + "it does not seem to be a scikit-learn estimator as " + "it does not implement a '_get_params' methods." + % (repr(estimator), type(estimator))) klass = estimator.__class__ new_object_params = estimator._get_params(deep=False) for name, param in new_object_params.iteritems(): - if hasattr(param, '_get_params'): - new_object_params[name] = clone(param) + new_object_params[name] = clone(param, safe=False) new_object = klass(**new_object_params) return new_object @@ -108,7 +129,7 @@ def _get_param_names(cls): args = [] return args - def _get_params(self, deep=False): + def _get_params(self, deep=True): """ Get parameters for the estimator Parameters diff --git a/scikits/learn/cross_val.py b/scikits/learn/cross_val.py index 6681d84665a46..d0545cc7a4b05 100644 --- a/scikits/learn/cross_val.py +++ b/scikits/learn/cross_val.py @@ -9,7 +9,7 @@ from math import ceil import numpy as np -from .base import is_classifier +from .base import is_classifier, clone from .utils.extmath import factorial, combinations from .externals.joblib import Parallel, delayed @@ -495,8 +495,10 @@ def cross_val_score(estimator, X, y=None, score_func=None, cv=None, "should have a 'score' method. The estimator %s " "does not." 
% estimator ) + # We clone the estimator to make sure that all the folds are + # independent, and that it is picklable. scores = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(_cross_val_score)(estimator, X, y, score_func, + delayed(_cross_val_score)(clone(estimator), X, y, score_func, train, test) for train, test in cv) return np.array(scores) diff --git a/scikits/learn/pipeline.py b/scikits/learn/pipeline.py index 74a3996a5f478..19c1c21265d47 100644 --- a/scikits/learn/pipeline.py +++ b/scikits/learn/pipeline.py @@ -101,7 +101,7 @@ def __init__(self, steps): "'%s' (type %s) doesn't)" % (estimator, type(estimator)) ) - def _get_params(self, deep=False): + def _get_params(self, deep=True): if not deep: return super(Pipeline, self)._get_params(deep=False) else: diff --git a/scikits/learn/tests/test_pipeline.py b/scikits/learn/tests/test_pipeline.py index 281ff81a97157..01807f26ce59e 100644 --- a/scikits/learn/tests/test_pipeline.py +++ b/scikits/learn/tests/test_pipeline.py @@ -2,7 +2,7 @@ Test the pipeline module. """ -from nose.tools import assert_raises, assert_equal +from nose.tools import assert_raises, assert_equal, assert_false from ..base import BaseEstimator, clone from ..pipeline import Pipeline @@ -56,4 +56,14 @@ def test_pipeline_init(): # Test clone pipe2 = clone(pipe) - assert_equal(pipe._get_params(), pipe2._get_params()) + assert_false(pipe._named_steps['svc'] is pipe2._named_steps['svc']) + + # Check that apart from estimators, the parameters are the same + params = pipe._get_params() + params2 = pipe2._get_params() + # Remove estimators that were copied + params.pop('svc') + params.pop('anova') + params2.pop('svc') + params2.pop('anova') + assert_equal(params, params2)