diff --git a/sklearn/base.py b/sklearn/base.py
index aa202a9c6da34..e5eb027ee6e5d 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -10,6 +10,8 @@
 from scipy import sparse
 from .externals import six
+from .utils.iter_fits import group_params
+
 
 ###############################################################################
 def clone(estimator, safe=True):
@@ -260,6 +262,21 @@ def __str__(self):
                 _pprint(self.get_params(deep=True),
                         offset=len(class_name), printer=str,),)
 
+    def iter_fits(self, param_iter, *args, **kwargs):
+        """Fit over an iterable of parameter settings, yielding (params, est).
+
+        If the estimator declares ``_non_fit_params``, settings that agree on
+        all other (fit-affecting) parameters share a single call to ``fit``.
+        """
+        if not hasattr(self, '_non_fit_params'):
+            for params in param_iter:
+                self.set_params(**params)
+                yield params, self.fit(*args, **kwargs)
+            return
+        non_fit_params = set(self._non_fit_params)
+        for group, entries in group_params(param_iter,
+                                           lambda k: k not in non_fit_params):
+            entries = list(entries)
+            self.set_params(**entries[0])
+            self.fit(*args, **kwargs)
+            for params in entries:
+                self.set_params(**params)
+                yield params, self
+
 
 ###############################################################################
 class ClassifierMixin(object):
diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py
index 6d01dbcaf1560..1360f34855464 100644
--- a/sklearn/feature_selection/univariate_selection.py
+++ b/sklearn/feature_selection/univariate_selection.py
@@ -380,6 +380,7 @@ class SelectPercentile(_ScoreFilter):
         way.
 
     """
+    _non_fit_params = ['percentile']
 
     def __init__(self, score_func=f_classif, percentile=10):
         if not 0 <= percentile <= 100:
@@ -437,6 +438,7 @@ class SelectKBest(_ScoreFilter):
         way.
 
     """
+    _non_fit_params = ['k']
 
     def __init__(self, score_func=f_classif, k=10):
         self.k = k
@@ -483,6 +485,7 @@ class SelectFpr(_PvalueFilter):
     `pvalues_` : array-like, shape=(n_features,)
         p-values of feature scores.
     """
+    _non_fit_params = ['alpha']
 
     def __init__(self, score_func=f_classif, alpha=5e-2):
         self.alpha = alpha
@@ -517,6 +520,7 @@ class SelectFdr(_PvalueFilter):
     `pvalues_` : array-like, shape=(n_features,)
         p-values of feature scores.
     """
+    _non_fit_params = ['alpha']
 
     def __init__(self, score_func=f_classif, alpha=5e-2):
         self.alpha = alpha
@@ -549,6 +553,7 @@ class SelectFwe(_PvalueFilter):
     `pvalues_` : array-like, shape=(n_features,)
         p-values of feature scores.
    """
+    _non_fit_params = ['alpha']
 
     def __init__(self, score_func=f_classif, alpha=5e-2):
         self.alpha = alpha
@@ -588,6 +593,7 @@ class GenericUnivariateSelect(_PvalueFilter):
     `pvalues_` : array-like, shape=(n_features,)
         p-values of feature scores.
     """
+    _non_fit_params = ['param', 'mode']
 
     _selection_modes = {'percentile': SelectPercentile,
                         'k_best': SelectKBest,
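
The selectors above compute their scores during fit and only apply the threshold (percentile, k, alpha, ...) afterwards, so declaring the threshold in _non_fit_params lets BaseEstimator.iter_fits fit once per group of fit-affecting parameters and merely re-set the threshold between yields. A minimal sketch of the intended usage; the dataset and parameter values here are made up for illustration:

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import SelectKBest

    X, y = make_classification(n_samples=100, n_features=20, random_state=0)
    selector = SelectKBest()
    # All three settings share the same fit parameters (score_func), so the
    # univariate scores are computed once; only `k` changes between yields.
    for params, est in selector.iter_fits([{'k': 5}, {'k': 10}, {'k': 15}],
                                          X, y):
        print(params, est.transform(X).shape)  # (100, 5), (100, 10), (100, 15)
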
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
index 40ff1a676693c..acbfddb85ca1e 100644
--- a/sklearn/grid_search.py
+++ b/sklearn/grid_search.py
@@ -190,7 +190,7 @@ def __len__(self):
         return self.n_iter
 
 
-def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer,
+def fit_grid_points(X, y, base_clf, param_iter, train, test, scorer,
                    verbose, loss_func=None, **fit_params):
-    """Run fit on one set of parameters.
+    """Run fit on an iterable of parameter settings.
@@ -245,7 +245,6 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer,
 
     # update parameters of the classifier after a copy of its base structure
     clf = clone(base_clf)
-    clf.set_params(**clf_params)
 
     if hasattr(base_clf, 'kernel') and callable(base_clf.kernel):
         # cannot compute the kernel values with custom function
@@ -272,31 +271,34 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer,
     if y is not None:
         y_test = y[safe_mask(y, test)]
         y_train = y[safe_mask(y, train)]
-        clf.fit(X_train, y_train, **fit_params)
-
-        if scorer is not None:
-            this_score = scorer(clf, X_test, y_test)
-        else:
-            this_score = clf.score(X_test, y_test)
+        train_args = (X_train, y_train)
+        test_args = (X_test, y_test)
     else:
-        clf.fit(X_train, **fit_params)
+        train_args = (X_train, )
+        test_args = (X_test, )
+
+    results = []
+    for params, est in clf.iter_fits(param_iter, *train_args, **fit_params):
         if scorer is not None:
-            this_score = scorer(clf, X_test)
+            this_score = scorer(est, *test_args)
         else:
-            this_score = clf.score(X_test)
+            this_score = est.score(*test_args)
 
-    if not isinstance(this_score, numbers.Number):
-        raise ValueError("scoring must return a number, got %s (%s)"
-                         " instead." % (str(this_score), type(this_score)))
+        if not isinstance(this_score, numbers.Number):
+            raise ValueError("scoring must return a number, got %s (%s)"
+                             " instead." % (str(this_score), type(this_score)))
+
+        if verbose > 2:
+            msg += ", score=%f" % this_score
+
+        results.append((params, this_score))
 
-    if verbose > 2:
-        msg += ", score=%f" % this_score
     if verbose > 1:
         end_msg = "%s -%s" % (msg,
                               logger.short_format_time(time.time() -
                                                        start_time))
         print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
-    return this_score, clf_params, _num_samples(X_test)
+    return results, _num_samples(X_test)
 
 
 def _check_param_grid(param_grid):
@@ -450,13 +452,18 @@ def _fit(self, X, y, parameter_iterator, **params):
         out = Parallel(
             n_jobs=self.n_jobs, verbose=self.verbose,
             pre_dispatch=pre_dispatch)(
-                delayed(fit_grid_point)(
-                    X, y, base_clf, clf_params, train, test, scorer,
-                    self.verbose, **self.fit_params) for clf_params in
-                parameter_iterator for train, test in cv)
+                delayed(fit_grid_points)(
+                    X, y, base_clf, parameter_iterator, train, test, scorer,
+                    self.verbose, **self.fit_params) for train, test in cv)
 
-        # Out is a list of triplet: score, estimator, n_test_samples
+        # transform out to a list of triplets (score, params, n_test_samples)
         n_param_points = len(list(parameter_iterator))
+        new_out = []
+        for i in range(n_param_points):
+            # XXX: assumes every fold iterates parameters in the same order!
+            new_out.extend((fold_out[0][i][1], fold_out[0][i][0], fold_out[1])
+                           for fold_out in out)
+        out = new_out
+
         n_fits = len(out)
         n_folds = n_fits // n_param_points
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 4d769d65367f7..038a73aab2b29 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -23,6 +23,7 @@
 from ..externals import six
 from ..externals.six.moves import xrange
 from ..utils.extmath import safe_sparse_dot
+from ..utils.iter_fits import group_params
 from . import cd_fast
@@ -187,6 +188,49 @@ def fit(self, X, y, Xy=None, coef_init=None):
         fit(X, y, Xy, coef_init)
         return self
 
+    def _iter_fits(self, param_iter, X, y, Xy=None, coef_init=None):
+        # Expects a param_iter in which only alpha varies, sorted by
+        # decreasing alpha, so that coef_ can warm-start each fit.
+        X = atleast2d_or_csc(X, dtype=np.float64, order='F',
+                             copy=self.copy_X and self.fit_intercept)
+        if not sparse.isspmatrix(X):
+            X, y, X_mean, y_mean, X_std = center_data(
+                X, y, self.fit_intercept, self.normalize, copy=False)
+            if Xy is None:
+                Xy = safe_sparse_dot(X.T, y, dense_output=True)
+
+        n_samples, n_features = X.shape
+        precompute = self.precompute
+        if (hasattr(precompute, '__array__')
+                and not np.allclose(X_mean, np.zeros(n_features))
+                and not np.allclose(X_std, np.ones(n_features))):
+            # recompute the Gram matrix: centering has changed the data
+            precompute = 'auto'
+            Xy = None
+        if precompute == 'auto':
+            precompute = (n_samples > n_features)
+            if not sparse.isspmatrix(X):
+                # compute the Gram matrix once; it is shared by all fits
+                precompute = np.dot(X.T, X)
+        self.precompute = precompute
+
+        self.coef_ = coef_init  # XXX: even if warm_start = True?
+        for params in param_iter:
+            self.set_params(**params)
+            yield params, self.fit(X, y, Xy, self.coef_)
+
+    def iter_fits(self, param_iter, X, y, Xy=None, coef_init=None):
+        if self.precompute is not None and sparse.isspmatrix(X):
+            warnings.warn("precompute is ignored for sparse data")
+
+        orig_params = self.get_params()
+        for group, entries in group_params(param_iter, lambda k: k != 'alpha'):
+            # restore the original precompute before each group is fit
+            self.set_params(**orig_params)
+            self.set_params(**group)
+            entries = sorted(entries,
+                             key=lambda x: -x.get('alpha', self.alpha))
+            for tup in self._iter_fits(entries, X, y, Xy, coef_init):
+                yield tup
+
 
     def _dense_fit(self, X, y, Xy=None, coef_init=None):
 
         # copy was done in fit if necessary
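
Here group_params holds everything except alpha fixed within a group, so each group can be fit as a regularization path in decreasing-alpha order, reusing the precomputed Gram matrix and warm-starting coef_ from the previous fit. A rough sketch of how a caller might exercise this; the data and the parameter values (including the use of l1_ratio) are illustrative assumptions:

    import numpy as np
    from sklearn.linear_model import ElasticNet

    rng = np.random.RandomState(0)
    X = rng.randn(50, 10)
    y = rng.randn(50)

    enet = ElasticNet()
    param_iter = [{'alpha': a, 'l1_ratio': r}
                  for r in (0.25, 0.75) for a in (0.01, 0.1)]
    # iter_fits regroups these by l1_ratio and visits the alphas within each
    # group from largest to smallest, so coef_ warm-starts the next fit.
    for params, est in enet.iter_fits(param_iter, X, y):
        print(params, np.count_nonzero(est.coef_))
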
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index c01de183bff38..a8c464a67e3f1 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -15,6 +15,8 @@
 from .externals.joblib import Parallel, delayed
 from .externals import six
 from .utils import tosequence
+from .utils.iter_fits import iter_fit_transforms, iter_fits, group_params, \
+    find_group
 from .externals.six import iteritems
 
 __all__ = ['Pipeline', 'FeatureUnion']
@@ -141,6 +143,35 @@ def fit_transform(self, X, y=None, **fit_params):
         else:
             return self.steps[-1][-1].fit(Xt, y, **fit_params).transform(Xt)
 
+    def iter_fits(self, param_iter, X, y=None, **fit_params):
+        for params, obj in self._iter_fits(param_iter, 0, False, X, y,
+                                           **fit_params):
+            yield params, self
+
+    def iter_fit_transforms(self, param_iter, X, y=None, **fit_params):
+        return self._iter_fits(param_iter, 0, True, X, y, **fit_params)
+
+    def _iter_fits(self, param_iter, step, transform_last, X, y=None,
+                   **fit_params):
+        # Recursively fit from `step` onwards, fitting and transforming each
+        # distinct setting of this step only once and reusing its output for
+        # all downstream parameter settings.
+        step_name, step_est = self.steps[step]
+
+        if step == len(self.steps) - 1:
+            step_param_iter = [{k.split('__', 1)[1]: v
+                                for k, v in iteritems(params)
+                                if k.startswith(step_name + '__')}
+                               for params in param_iter]
+            fn = iter_fit_transforms if transform_last else iter_fits
+            for params, obj in fn(step_est, step_param_iter, X, y,
+                                  **fit_params):
+                yield {step_name + '__' + k: v
+                       for k, v in iteritems(params)}, obj
+            return
+
+        grouped = list(group_params(param_iter,
+                                    lambda k: k.startswith(step_name + '__')))
+        step_param_iter = [{k.split('__', 1)[1]: v
+                            for k, v in iteritems(group)}
+                           for group, entries in grouped]
+        for group, Xt in iter_fit_transforms(step_est, step_param_iter,
+                                             X, y, **fit_params):
+            group = {step_name + '__' + k: v for k, v in iteritems(group)}
+            entries = find_group(grouped, group)
+            for params, obj in self._iter_fits(entries, step + 1,
+                                               transform_last, Xt, y,
+                                               **fit_params):
+                params.update(group)
+                yield params, obj
+
     def predict(self, X):
         """Applies transforms to the data, and the predict method of the
         final estimator. Valid only if the final estimator implements
diff --git a/sklearn/utils/iter_fits.py b/sklearn/utils/iter_fits.py
new file mode 100644
index 0000000000000..5bcc8d98acdfa
--- /dev/null
+++ b/sklearn/utils/iter_fits.py
@@ -0,0 +1,100 @@
+from collections import defaultdict
+
+from ..externals.six import iteritems
+
+
+def _default_iter_fits(est, fit, param_iter, *args, **kwargs):
+    for params in param_iter:
+        est.set_params(**params)
+        yield params, fit(*args, **kwargs)
+
+
+def iter_fits(est, param_iter, *args, **kwargs):
+    """Fit est for each parameter setting, yielding (params, fitted est)."""
+    if hasattr(est, 'iter_fits'):
+        return est.iter_fits(param_iter, *args, **kwargs)
+    return _default_iter_fits(est, est.fit, param_iter, *args, **kwargs)
+
+
+def _fit_transform(est):
+    if hasattr(est, 'fit_transform'):
+        return est.fit_transform
+
+    def fn(X, *args, **kwargs):
+        est.fit(X, *args, **kwargs)
+        return est.transform(X)
+    return fn
+
+
+def iter_fit_transforms(est, param_iter, X, *args, **kwargs):
+    """Fit est for each parameter setting, yielding (params, transformed X)."""
+    if hasattr(est, 'iter_fit_transforms'):
+        return est.iter_fit_transforms(param_iter, X, *args, **kwargs)
+    if hasattr(est, 'iter_fits'):
+        return ((params, new_est.transform(X))
+                for params, new_est
+                in est.iter_fits(param_iter, X, *args, **kwargs))
+    return _default_iter_fits(est, _fit_transform(est), param_iter, X,
+                              *args, **kwargs)
+
+
+def group_params(items, key_name=lambda x: True):
+    """Bin param dicts by the sub-dicts of keys selected by key_name.
+
+    Values are compared by identity if they are not hashable.
+
+    >>> a = ('boo', 6)
+    >>> b = ('boo', 6)
+    >>> id(a) == id(b)
+    False
+    >>> import numpy
+    >>> c = numpy.array([1, 2, 3])
+    >>> items = [{'p': a, 'q': 1}, {'p': b, 'q': 2},
+    ...          {'p': c, 'q': 3}, {'p': c, 'q': 4}]
+    >>> groups = list(group_params(items, lambda x: x == "p"))
+    >>> len(groups)
+    2
+    >>> sorted([[x['q'] for x in gitems]
+    ...         for g, gitems in groups if g['p'] == a][0])
+    [1, 2]
+    >>> sorted([[x['q'] for x in gitems]
+    ...         for g, gitems in groups if g['p'] is c][0])
+    [3, 4]
+    """
+    # TODO: could reduce work if the input were sorted tuples rather than dicts
+
+    groups = defaultdict(list)
+    canonical = {}  # maps a hashable value to a canonical instance of it
+    id_to_obj = {}
+
+    for params in items:
+        group = []
+        for k, v in iteritems(params):
+            if key_name(k):
+                try:
+                    v_id = id(canonical.setdefault(v, v))
+                except TypeError:
+                    # unhashable: fall back to identity
+                    v_id = id(v)
+                id_to_obj[v_id] = v
+                group.append((k, v_id))
+        groups[tuple(sorted(group))].append(params)
+
+    return [({k: id_to_obj[v_id] for k, v_id in group}, group_items)
+            for group, group_items in iteritems(groups)]
+
+
+def find_group(haystack, needle):
+    """Given the output of group_params, get the entries for a single group.
+
+    Again designed to cope with non-hashable, non-comparable values.
+    """
+    for group, entries in haystack:
+        if len(group) != len(needle):
+            continue
+        for key, val in iteritems(group):
+            try:
+                other_val = needle[key]
+            except KeyError:
+                break
+            if val is other_val:
+                continue
+            try:
+                if val == other_val:
+                    continue
+            except (ValueError, TypeError):
+                # e.g. numpy elementwise equality cannot be coerced to bool;
+                # identity was already checked above, so treat as a mismatch
+                pass
+            break
+        else:
+            return entries
+    raise ValueError('Could not find {!r} in given groups:\n{!r}'
+                     .format(needle, haystack))
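
Putting the pieces together: a grid over a pipeline now shares the fit of each upstream step across all downstream settings, with group_params/find_group pairing each distinct upstream setting with the parameter dicts that extend it. An end-to-end sketch; the pipeline, data and grid are invented for illustration:

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import SelectKBest
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=100, n_features=20, random_state=0)
    pipe = Pipeline([('select', SelectKBest()), ('svm', SVC())])
    param_iter = [{'select__k': k, 'svm__C': C}
                  for k in (5, 10) for C in (0.1, 1.0)]
    # SelectKBest is fit once (its scores do not depend on k), its output is
    # transformed once per k, and only the SVC is refit for each C.
    for params, fitted in pipe.iter_fits(param_iter, X, y):
        print(params, fitted.score(X, y))
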