ENH/FIX timing and training score.

raghavrv · raghavrv · commit c3478e39f872 · 2016-09-25T14:09:06.000+02:00
* ENH separate fit / score times
* Make score_time=0 if errored; Ignore warnings in test
* Cleanup docstrings
* ENH Use helper to store the results
* Move fit time computation to else of try...except...else
* DOC readable sample scores
* COSMIT Add a commnent on why time test is &gt;= 0 instead of &gt; 0
  (Windows time.time precision is not accurate enought to be non-zero
   for trivial fits)
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -100,6 +100,20 @@ Model Selection Enhancements and API Changes
     The parameter ``n_labels`` in the newly renamed
     :class:`model_selection.LeavePGroupsOut` is changed to ``n_groups``.
 
+  - Training scores and Timing information
+
+    ``cv_results_`` also includes the training scores for each
+    cross-validation split (with keys such as ``'split0_train_score'``), as
+    well as their mean (``'mean_train_score'``) and standard deviation
+    (``'std_train_score'``). To avoid the cost of evaluating training score,
+    set ``return_train_score=False``.
+
+    Additionally the mean and standard deviation of the times taken to split,
+    train and score the model across all the cross-validation splits is
+    available at the key ``'mean_time'`` and ``'std_time'`` respectively.
+
+Changelog
+---------
 
 New features
 ............
@@ -349,6 +363,12 @@ Enhancements
      now accept arbitrary kernel functions in addition to strings ``knn`` and ``rbf``.
      (`#5762 <https://github.com/scikit-learn/scikit-learn/pull/5762>`_) By `Utkarsh Upadhyay`_.
 
+   - The training scores and time taken for training followed by scoring for
+     each search candidate are now available at the ``cv_results_`` dict.
+     See :ref:`model_selection_changes` for more information.
+     (`#7324 <https://github.com/scikit-learn/scikit-learn/pull/7325>`)
+     By `Eugene Chen`_ and `Raghav RV`_.
+
 
 Bug fixes
 .........
@@ -4651,3 +4671,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Russell Smith: https://github.com/rsmith54
 
 .. _Utkarsh Upadhyay: https://github.com/musically-ut
+
+.. _Eugene Chen: https://github.com/eyc88
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
@@ -319,7 +319,9 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
     """
     score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train,
                                               test, verbose, parameters,
-                                              fit_params, error_score)
+                                              fit_params=fit_params,
+                                              return_n_test_samples=True,
+                                              error_score=error_score)
     return score, parameters, n_samples_test
 
 
@@ -552,77 +554,61 @@ def _fit(self, X, y, groups, parameter_iterable):
             pre_dispatch=pre_dispatch
         )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                   train, test, self.verbose, parameters,
-                                  self.fit_params,
+                                  fit_params=self.fit_params,
                                   return_train_score=self.return_train_score,
-                                  return_parameters=True,
+                                  return_n_test_samples=True,
+                                  return_times=True, return_parameters=True,
                                   error_score=self.error_score)
           for parameters in parameter_iterable
           for train, test in cv.split(X, y, groups))
 
         # if one choose to see train score, "out" will contain train score info
         if self.return_train_score:
-            train_scores, test_scores, test_sample_counts, time, parameters =\
-                zip(*out)
+            (train_scores, test_scores, test_sample_counts,
+             fit_time, score_time, parameters) = zip(*out)
         else:
-            test_scores, test_sample_counts, time, parameters = zip(*out)
+            (test_scores, test_sample_counts,
+             fit_time, score_time, parameters) = zip(*out)
 
         candidate_params = parameters[::n_splits]
         n_candidates = len(candidate_params)
 
-        # if one choose to return train score, reshape the train_scores array
-        if self.return_train_score:
-            train_scores = np.array(train_scores,
-                                    dtype=np.float64).reshape(n_candidates,
+        results = dict()
+
+        def _store(key_name, array, weights=None, splits=False, rank=False):
+            """A small helper to store the scores/times to the cv_results_"""
+            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                               n_splits)
-        test_scores = np.array(test_scores,
-                               dtype=np.float64).reshape(n_candidates,
-                                                         n_splits)
+            if splits:
+                for split_i in range(n_splits):
+                    results["split%d_%s"
+                            % (split_i, key_name)] = array[:, split_i]
+
+            array_means = np.average(array, axis=1, weights=weights)
+            results['mean_%s' % key_name] = array_means
+            # Weighted std is not directly available in numpy
+            array_stds = np.sqrt(np.average((array -
+                                             array_means[:, np.newaxis]) ** 2,
+                                            axis=1, weights=weights))
+            results['std_%s' % key_name] = array_stds
+
+            if rank:
+                results["rank_%s" % key_name] = np.asarray(
+                    rankdata(-array_means, method='min'), dtype=np.int32)
+
+        # Computed the (weighted) mean and std for test scores alone
         # NOTE test_sample counts (weights) remain the same for all candidates
         test_sample_counts = np.array(test_sample_counts[:n_splits],
                                       dtype=np.int)
 
-        # Computed the (weighted) mean and std for test scores
-        weights = test_sample_counts if self.iid else None
-        test_means = np.average(test_scores, axis=1, weights=weights)
-        test_stds = np.sqrt(
-            np.average((test_scores - test_means[:, np.newaxis]) ** 2, axis=1,
-                       weights=weights))
-
-        time = np.array(time, dtype=np.float64).reshape(n_candidates, n_splits)
-        time_means = np.average(time, axis=1)
-        time_stds =  np.sqrt(
-            np.average((time - time_means[:, np.newaxis]) ** 2,
-                       axis=1))
-        if self.return_train_score:
-            train_means = np.average(train_scores, axis=1)
-            train_stds = np.sqrt(
-                np.average((train_scores - train_means[:, np.newaxis]) ** 2,
-                           axis=1))
-
-        cv_results = dict()
-        for split_i in range(n_splits):
-            cv_results["split%d_test_score" % split_i] = test_scores[:,
-                                                                     split_i]
-        cv_results["mean_test_score"] = means
-        cv_results["std_test_score"] = stds
+        _store('test_score', test_scores, splits=True, rank=True,
+               weights=test_sample_counts if self.iid else None)
+        _store('train_score', train_scores, splits=True)
+        _store('fit_time', fit_time)
+        _store('score_time', score_time)
 
-        if self.return_train_score:
-            for split_i in range(n_splits):
-                results["train_split%d_score" % split_i] = (
-                    train_scores[:, split_i])
-            results["mean_train_score"] = train_means
-            results["std_train_scores"] = train_stds
-            results["rank_train_scores"] = np.asarray(rankdata(-train_means,
-                                                               method='min'),
-                                                      dtype=np.int32)
-
-        results["mean_test_time"] = time_means
-        results["std_test_time"] = time_stds
-        ranks = np.asarray(rankdata(-test_means, method='min'), dtype=np.int32)
-
-        best_index = np.flatnonzero(ranks == 1)[0]
+        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
         best_parameters = candidate_params[best_index]
-        cv_results["rank_test_score"] = ranks
 
         # Use one np.MaskedArray and mask all the places where the param is not
         # applicable for that candidate. Use defaultdict as each candidate may
@@ -636,12 +622,12 @@ def _fit(self, X, y, groups, parameter_iterable):
                 # Setting the value at an index also unmasks that index
                 param_results["param_%s" % name][cand_i] = value
 
-        cv_results.update(param_results)
+        results.update(param_results)
 
         # Store a list of param dicts at the key 'params'
-        cv_results['params'] = candidate_params
+        results['params'] = candidate_params
 
-        self.cv_results_ = cv_results
+        self.cv_results_ = results
         self.best_index_ = best_index
         self.n_splits_ = n_splits
 
@@ -783,8 +769,8 @@ class GridSearchCV(BaseSearchCV):
         FitFailedWarning is raised. This parameter does not affect the refit
         step, which will always raise the error.
 
-    return_train_score: boolean, default=True
-        If ``'False'``, the results_ attribute will not include training
+    return_train_score : boolean, default=True
+        If ``'False'``, the ``cv_results_`` attribute will not include training
         scores.
 
 
@@ -809,10 +795,12 @@ class GridSearchCV(BaseSearchCV):
            scoring=..., verbose=...)
     >>> sorted(clf.cv_results_.keys())
     ...                             # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
-    ['mean_test_score', 'mean_test_time', 'mean_train_score',...
-     'param_C', 'param_kernel', 'params', 'rank_test_score',...
-     'split0_test_score', 'split1_test_score',...
-     'split2_test_score', 'std_test_score', 'std_test_time'...]
+    ['mean_fit_time', 'mean_score_time', 'mean_test_score',...
+     'mean_train_score', 'param_C', 'param_kernel', 'params',...
+     'rank_test_score', 'split0_test_score',...
+     'split0_train_score', 'split1_test_score', 'split1_train_score',...
+     'split2_test_score', 'split2_train_score',...
+     'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score'...]
 
     Attributes
     ----------
@@ -843,25 +831,24 @@ class GridSearchCV(BaseSearchCV):
                                         mask = [ True  True False False]...),
             'param_degree': masked_array(data = [2.0 3.0 -- --],
                                          mask = [False False  True  True]...),
-            'split0_test_score' : [0.8, 0.7, 0.8, 0.9],
-            'split1_test_score' : [0.82, 0.5, 0.7, 0.78],
-            'mean_test_score'   : [0.81, 0.60, 0.75, 0.82],
-            'std_test_score'    : [0.02, 0.01, 0.03, 0.03],
-            'rank_test_score'   : [2, 4, 3, 1],
-            'split0_train_score': [0.9, 0.8, 0.85, 1.]
-            'split1_train_score': [0.95, 0.7, 0.8, 0.8]
-            'mean_train_score'  : [0.93, 0.75, 0.83, 0.9]
-            'std_train_score'    : [0.02, 0.01, 0.03, 0.03],
-            'rank_train_score'   : [2, 4, 3, 1],
-            'mean_test_time'    : [0.00073, 0.00063, 0.00043, 0.00049]
-            'std_test_time'     : [1.62e-4, 3.37e-5, 1.42e-5, 1.1e-5]
-            'params'            : [{'kernel': 'poly', 'degree': 2}, ...],
+            'split0_test_score'  : [0.8, 0.7, 0.8, 0.9],
+            'split1_test_score'  : [0.82, 0.5, 0.7, 0.78],
+            'mean_test_score'    : [0.81, 0.60, 0.75, 0.82],
+            'std_test_score'     : [0.02, 0.01, 0.03, 0.03],
+            'rank_test_score'    : [2, 4, 3, 1],
+            'split0_train_score' : [0.8, 0.9, 0.7],
+            'split1_train_score' : [0.82, 0.5, 0.7],
+            'mean_train_score'   : [0.81, 0.7, 0.7],
+            'std_train_score'    : [0.03, 0.03, 0.04],
+            'mean_fit_time'      : [0.73, 0.63, 0.43, 0.49],
+            'std_fit_time'       : [0.01, 0.02, 0.01, 0.01],
+            'mean_score_time'    : [0.007, 0.06, 0.04, 0.04],
+            'std_score_time'     : [0.001, 0.002, 0.003, 0.005],
+            'params'             : [{'kernel': 'poly', 'degree': 2}, ...],
             }
 
         NOTE that the key ``'params'`` is used to store a list of parameter
-        settings dict for all the parameter candidates.  Besides,
-        ``'train_mean_score'``, ``'train_split*_score'``, ... will be present
-        when ``return_train_score=True``.
+        settings dict for all the parameter candidates.
 
     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
@@ -920,7 +907,7 @@ class GridSearchCV(BaseSearchCV):
     def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
                  n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
                  pre_dispatch='2*n_jobs', error_score='raise',
-                 return_train_score=False):
+                 return_train_score=True):
         super(GridSearchCV, self).__init__(
             estimator=estimator, scoring=scoring, fit_params=fit_params,
             n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
@@ -1059,8 +1046,8 @@ class RandomizedSearchCV(BaseSearchCV):
         FitFailedWarning is raised. This parameter does not affect the refit
         step, which will always raise the error.
 
-    return_train_score: boolean, default=True
-        If ``'False'``, the results_ attribute will not include training
+    return_train_score : boolean, default=True
+        If ``'False'``, the ``cv_results_`` attribute will not include training
         scores.
 
     Attributes
@@ -1095,19 +1082,16 @@ class RandomizedSearchCV(BaseSearchCV):
             'split0_train_score' : [0.8, 0.9, 0.7],
             'split1_train_score' : [0.82, 0.5, 0.7],
             'mean_train_score'   : [0.81, 0.7, 0.7],
-            'std_train_score'    : [0.00073, 0.00063, 0.00043]
-            'rank_train_score'   : [1.62e-4, 3.37e-5, 1.1e-5]
-            'test_mean_time'    : [0.00073, 0.00063, 0.00043]
-            'test_std_time'     : [1.62e-4, 3.37e-5, 1.1e-5]
-            'test_std_score'    : [0.02, 0.2, 0.],
-            'test_rank_score'   : [3, 1, 1],
+            'std_train_score'    : [0.03, 0.03, 0.04],
+            'mean_fit_time'      : [0.73, 0.63, 0.43, 0.49],
+            'std_fit_time'       : [0.01, 0.02, 0.01, 0.01],
+            'mean_score_time'    : [0.007, 0.06, 0.04, 0.04],
+            'std_score_time'     : [0.001, 0.002, 0.003, 0.005],
             'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
             }
 
         NOTE that the key ``'params'`` is used to store a list of parameter
-        settings dict for all the parameter candidates.  Besides,
-        'train_mean_score', 'train_split*_score', ... will be present when
-        return_train_score is set to True.
+        settings dict for all the parameter candidates.
 
     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
@@ -1162,7 +1146,7 @@ class RandomizedSearchCV(BaseSearchCV):
     def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
                  fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
                  verbose=0, pre_dispatch='2*n_jobs', random_state=None,
-                 error_score='raise', return_train_score=False):
+                 error_score='raise', return_train_score=True):
         self.param_distributions = param_distributions
         self.n_iter = n_iter
         self.random_state = random_state
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
@@ -1,3 +1,4 @@
+
 """
 The :mod:`sklearn.model_selection._validation` module includes classes and
 functions to validate the model.
@@ -142,7 +143,8 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
 
 def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                    parameters, fit_params, return_train_score=False,
-                   return_parameters=False, error_score='raise'):
+                   return_parameters=False, return_n_test_samples=False,
+                   return_times=False, error_score='raise'):
     """Fit estimator and compute scores for a given dataset split.
 
     Parameters
@@ -199,8 +201,11 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
     n_test_samples : int
         Number of test samples.
 
-    scoring_time : float
-        Time spent for fitting and scoring in seconds.
+    fit_time : float
+        Time spent for fitting in seconds.
+
+    score_time : float
+        Time spent for scoring in seconds.
 
     parameters : dict or None, optional
         The parameters that have been evaluated.
@@ -233,6 +238,9 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
             estimator.fit(X_train, y_train, **fit_params)
 
     except Exception as e:
+        # Note fit time as time until error
+        fit_time = time.time() - start_time
+        score_time = 0.0
         if error_score == 'raise':
             raise
         elif isinstance(error_score, numbers.Number):
@@ -248,20 +256,24 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                              " make sure that it has been spelled correctly.)")
 
     else:
+        fit_time = time.time() - start_time
         test_score = _score(estimator, X_test, y_test, scorer)
+        score_time = time.time() - start_time - fit_time
         if return_train_score:
             train_score = _score(estimator, X_train, y_train, scorer)
 
-    scoring_time = time.time() - start_time
-
     if verbose > 2:
         msg += ", score=%f" % test_score
     if verbose > 1:
-        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
+        end_msg = "%s -%s" % (msg, logger.short_format_time(score_time))
         print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
 
-    ret = [train_score] if return_train_score else []
-    ret.extend([test_score, _num_samples(X_test), scoring_time])
+    ret = [train_score, test_score] if return_train_score else [test_score]
+
+    if return_n_test_samples:
+        ret.append(_num_samples(X_test))
+    if return_times:
+        ret.extend([fit_time, score_time])
     if return_parameters:
         ret.append(parameters)
     return ret
@@ -758,7 +770,7 @@ def learning_curve(estimator, X, y, groups=None,
             verbose, parameters=None, fit_params=None, return_train_score=True)
             for train, test in cv_iter
             for n_train_samples in train_sizes_abs)
-        out = np.array(out)[:, :2]
+        out = np.array(out)
         n_cv_folds = out.shape[0] // n_unique_ticks
         out = out.reshape(n_cv_folds, n_unique_ticks, 2)
 
@@ -941,7 +953,7 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None,
         parameters={param_name: v}, fit_params=None, return_train_score=True)
         for train, test in cv.split(X, y, groups) for v in param_range)
 
-    out = np.asarray(out)[:, :2]
+    out = np.asarray(out)
     n_params = len(param_range)
     n_cv_folds = out.shape[0] // n_params
     out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0))
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py