[WIP] scorer_params and sample_weight support #3524

Closed · wants to merge 5 commits
Changes from all commits
46 changes: 35 additions & 11 deletions sklearn/cross_validation.py
@@ -1077,7 +1077,8 @@ def __len__(self):


def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
-                    verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
+                    verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
+                    scorer_params=None):
"""Evaluate a score by cross-validation

Parameters
@@ -1130,6 +1131,10 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
- A string, giving an expression as a function of n_jobs,
as in '2*n_jobs'

+    scorer_params : dict, optional
+        Parameters to pass to the scorer. Can be used for sample weights
+        and sample groups.

Returns
-------
scores : array of float, shape=(len(list(cv)),)
@@ -1143,15 +1148,15 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
# independent, and that it is pickle-able.
parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
pre_dispatch=pre_dispatch)
-    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
-                                              train, test, verbose, None,
-                                              fit_params)
+    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y,
+                                              scorer, train, test, verbose,
+                                              None, fit_params, scorer_params)
for train, test in cv)
return np.array(scores)[:, 0]


def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
-                   fit_params, return_train_score=False,
+                   fit_params, scorer_params, return_train_score=False,
return_parameters=False):
"""Fit estimator and compute scores for a given dataset split.

@@ -1163,7 +1168,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
X : array-like of shape at least 2D
The data to fit.

-    y : array-like, optional, default: None
+    y : array-like or None
The target variable to try to predict in the case of
supervised learning.

@@ -1186,6 +1191,9 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
fit_params : dict or None
Parameters that will be passed to ``estimator.fit``.

+    scorer_params : dict or None
+        Parameters that will be passed to the scorer.

return_train_score : boolean, optional, default: False
Compute and return score on training set.

@@ -1224,20 +1232,36 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
if hasattr(v, '__len__') and len(v) == n_samples else v)
for k, v in fit_params.items()])

+    # Same, but take both slices
+    scorer_params = scorer_params if scorer_params is not None else {}
+    train_scorer_params = dict([(k, np.asarray(v)[train]
+                                 if hasattr(v, '__len__')
+                                 and len(v) == n_samples
+                                 else v)
+                                for k, v in scorer_params.items()])
+    test_scorer_params = dict([(k, np.asarray(v)[test]
+                                if hasattr(v, '__len__')
+                                and len(v) == n_samples
+                                else v)
+                               for k, v in scorer_params.items()])

if parameters is not None:
estimator.set_params(**parameters)

start_time = time.time()

X_train, y_train = _safe_split(estimator, X, y, train)
X_test, y_test = _safe_split(estimator, X, y, test, train)

if y_train is None:
estimator.fit(X_train, **fit_params)
else:
estimator.fit(X_train, y_train, **fit_params)
-    test_score = _score(estimator, X_test, y_test, scorer)
+    test_score = _score(estimator, X_test, y_test, scorer,
+                        **test_scorer_params)
if return_train_score:
-        train_score = _score(estimator, X_train, y_train, scorer)
+        train_score = _score(estimator, X_train, y_train, scorer,
+                             **train_scorer_params)

scoring_time = time.time() - start_time

@@ -1286,12 +1310,12 @@ def _safe_split(estimator, X, y, indices, train_indices=None):
return X_subset, y_subset


-def _score(estimator, X_test, y_test, scorer):
+def _score(estimator, X_test, y_test, scorer, **params):
"""Compute the score of an estimator on a given test set."""
if y_test is None:
-        score = scorer(estimator, X_test)
+        score = scorer(estimator, X_test, **params)
else:
-        score = scorer(estimator, X_test, y_test)
+        score = scorer(estimator, X_test, y_test, **params)
if not isinstance(score, numbers.Number):
raise ValueError("scoring must return a number, got %s (%s) instead."
% (str(score), type(score)))
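For illustration only, a hypothetical usage sketch of the API these changes enable (this snippet is not part of the diff). It assumes a plain callable scorer and that accuracy_score accepts sample_weight; full-length weight vectors are passed through fit_params and scorer_params, and _fit_and_score slices any entry whose length equals n_samples down to the current train or test fold.

# Hypothetical sketch, not part of the PR: weighted fitting and scoring
# with cross_val_score on this branch of scikit-learn.
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = rng.randint(0, 2, 100)
weights = rng.rand(100)  # one weight per sample

def weighted_accuracy(estimator, X_test, y_test, sample_weight=None):
    # scorer_params arrive here as keyword arguments, already sliced
    # down to the current test fold by _fit_and_score.
    return accuracy_score(y_test, estimator.predict(X_test),
                          sample_weight=sample_weight)

scores = cross_val_score(SVC(), X, y, scoring=weighted_accuracy, cv=5,
                         fit_params={'sample_weight': weights},
                         scorer_params={'sample_weight': weights})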
30 changes: 23 additions & 7 deletions sklearn/feature_selection/rfe.py
@@ -306,7 +306,7 @@ def __init__(self, estimator, step=1, cv=None, scoring=None,
self.estimator_params = estimator_params
self.verbose = verbose

-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
"""Fit the RFE model and automatically tune the number of selected
features.

@@ -319,6 +319,9 @@ def fit(self, X, y):
y : array-like, shape = [n_samples]
Target values (integers for classification, real numbers for
regression).

+        sample_weight : array-like, shape = [n_samples], optional (default=None)
+            Sample weights.
"""
X, y = check_X_y(X, y, "csr")
# Initialization
@@ -332,17 +335,27 @@

# Cross-validation
for n, (train, test) in enumerate(cv):
-            X_train, y_train = _safe_split(self.estimator, X, y, train)
-            X_test, y_test = _safe_split(self.estimator, X, y, test, train)
+            X_train, y_train = _safe_split(
+                self.estimator, X, y, train)
+            X_test, y_test = _safe_split(
+                self.estimator, X, y, test, train)

+            fit_params = dict()
+            score_params = dict()
+            if sample_weight is not None:
+                sample_weight = np.asarray(sample_weight)
+                fit_params['sample_weight'] = sample_weight[train]
+                score_params['sample_weight'] = sample_weight[test]

# Compute a full ranking of the features
-            ranking_ = rfe.fit(X_train, y_train).ranking_
+            ranking_ = rfe.fit(X_train, y_train, **fit_params).ranking_
# Score each subset of features
for k in range(0, max(ranking_)):
mask = np.where(ranking_ <= k + 1)[0]
estimator = clone(self.estimator)
-                estimator.fit(X_train[:, mask], y_train)
-                score = _score(estimator, X_test[:, mask], y_test, scorer)
+                estimator.fit(X_train[:, mask], y_train, **fit_params)
+                score = _score(
+                    estimator, X_test[:, mask], y_test, scorer, **score_params)

if self.verbose > 0:
print("Finished fold with %d / %d feature ranks, score=%f"
@@ -358,7 +371,10 @@ def fit(self, X, y):
n_features_to_select=k+1,
step=self.step, estimator_params=self.estimator_params)

-        rfe.fit(X, y)
+        if sample_weight is not None:
+            rfe.fit(X, y, sample_weight=sample_weight)
+        else:
+            rfe.fit(X, y)

# Set final attributes
self.support_ = rfe.support_
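Again for illustration only (not part of the diff), a hypothetical sketch of weighted recursive feature elimination with the RFECV change above. It assumes the wrapped estimator's fit accepts sample_weight (as linear SVC's does) and that RFE.fit forwards **fit_params, which the call in this diff implies; RFECV splits the weight vector per fold, sending training weights to fit and test weights to the scorer.

# Hypothetical sketch, not part of the PR: weighted RFECV.
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.rand(60, 8)
y = rng.randint(0, 2, 60)
weights = rng.rand(60)  # one weight per sample

# A linear SVC exposes coef_, which RFE needs for ranking features.
selector = RFECV(SVC(kernel='linear'), step=1, cv=3)
selector.fit(X, y, sample_weight=weights)
print(selector.support_)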
39 changes: 23 additions & 16 deletions sklearn/grid_search.py
@@ -8,6 +8,7 @@
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Andreas Mueller <amueller@ais.uni-bonn.de>
# Olivier Grisel <olivier.grisel@ensta.org>
+#          Noel Dawe <noel@dawe.me>
# License: BSD 3 clause

from abc import ABCMeta, abstractmethod
@@ -226,7 +227,8 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
n_samples_test : int
Number of test samples in this split.
"""
-    score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train,
-                                              test, verbose, parameters,
-                                              fit_params)
+    score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train,
+                                              test, verbose, parameters,
+                                              fit_params, None)
return score, parameters, n_samples_test
@@ -279,7 +281,8 @@ class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator,
@abstractmethod
def __init__(self, estimator, scoring=None,
fit_params=None, n_jobs=1, iid=True,
-                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'):
+                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
+                 scorer_params=None):

self.scoring = scoring
self.estimator = estimator
@@ -290,8 +293,9 @@ def __init__(self, estimator, scoring=None,
self.cv = cv
self.verbose = verbose
self.pre_dispatch = pre_dispatch
+        self.scorer_params = scorer_params

-    def score(self, X, y=None):
+    def score(self, X, y=None, **scorer_params):
"""Returns the score on the given test data and labels, if the search
estimator has been refit. The ``score`` function of the best estimator
is used, or the ``scoring`` parameter where unavailable.
@@ -312,12 +316,12 @@ def score(self, X, y=None):

"""
if hasattr(self.best_estimator_, 'score'):
-            return self.best_estimator_.score(X, y)
+            return self.best_estimator_.score(X, y, **scorer_params)
if self.scorer_ is None:
raise ValueError("No score function explicitly defined, "
"and the estimator doesn't provide one %s"
% self.best_estimator_)
-        return self.scorer_(self.best_estimator_, X, y)
+        return self.scorer_(self.best_estimator_, X, y, **scorer_params)

@property
def predict(self):
@@ -350,6 +354,7 @@ def _fit(self, X, y, parameter_iterable):
raise ValueError('Target variable (y) has a different number '
'of samples (%i) than data (X: %i samples)'
% (len(y), n_samples))

cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

if self.verbose > 0:
@@ -367,9 +372,10 @@ def _fit(self, X, y, parameter_iterable):
n_jobs=self.n_jobs, verbose=self.verbose,
pre_dispatch=pre_dispatch
)(
-            delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
-                                    train, test, self.verbose, parameters,
-                                    self.fit_params, return_parameters=True)
+            delayed(_fit_and_score)(clone(base_estimator), X, y,
+                                    self.scorer_, train, test,
+                                    self.verbose, parameters, self.fit_params,
+                                    self.scorer_params, return_parameters=True)
for parameters in parameter_iterable
for train, test in cv)

@@ -411,14 +417,15 @@ def _fit(self, X, y, parameter_iterable):
self.best_score_ = best.mean_validation_score

if self.refit:
+            fit_params = self.fit_params
# fit the best estimator using the entire dataset
# clone first to work around broken estimators
best_estimator = clone(base_estimator).set_params(
**best.parameters)
if y is not None:
-                best_estimator.fit(X, y, **self.fit_params)
+                best_estimator.fit(X, y, **fit_params)
else:
-                best_estimator.fit(X, **self.fit_params)
+                best_estimator.fit(X, **fit_params)
self.best_estimator_ = best_estimator
return self

@@ -566,10 +573,11 @@ class GridSearchCV(BaseSearchCV):

def __init__(self, estimator, param_grid, scoring=None,
fit_params=None, n_jobs=1, iid=True,
-                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'):
+                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
+                 scorer_params=None):
super(GridSearchCV, self).__init__(
estimator, scoring, fit_params, n_jobs, iid,
-            refit, cv, verbose, pre_dispatch)
+            refit, cv, verbose, pre_dispatch, scorer_params)
self.param_grid = param_grid
_check_param_grid(param_grid)

@@ -586,7 +594,6 @@ def fit(self, X, y=None):
y : array-like, shape = [n_samples] or [n_samples, n_output], optional
Target relative to X for classification or regression;
None for unsupervised learning.

"""
return self._fit(X, y, ParameterGrid(self.param_grid))

@@ -714,15 +721,16 @@ class RandomizedSearchCV(BaseSearchCV):

def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
-                 verbose=0, pre_dispatch='2*n_jobs', random_state=None):
+                 verbose=0, pre_dispatch='2*n_jobs', random_state=None,
+                 scorer_params=None):

self.param_distributions = param_distributions
self.n_iter = n_iter
self.random_state = random_state
super(RandomizedSearchCV, self).__init__(
estimator=estimator, scoring=scoring, fit_params=fit_params,
n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
-            pre_dispatch=pre_dispatch)
+            pre_dispatch=pre_dispatch, scorer_params=scorer_params)

def fit(self, X, y=None):
"""Run fit on the estimator with randomly drawn parameters.
Expand All @@ -736,7 +744,6 @@ def fit(self, X, y=None):
y : array-like, shape = [n_samples] or [n_samples, n_output], optional
Target relative to X for classification or regression;
None for unsupervised learning.

"""
sampled_params = ParameterSampler(self.param_distributions,
self.n_iter,
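Finally, a hypothetical end-to-end sketch (not part of the diff) of a weighted grid search. The scorer_params dict given to the constructor flows through _fit_and_score and is sliced per test fold, exactly like fit_params; the callable scorer below is a made-up example, and the weights reused for the refit are an assumption of this sketch.

# Hypothetical sketch, not part of the PR: weighted grid search.
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(0)
X = rng.rand(80, 5)
y = rng.randint(0, 2, 80)
weights = rng.rand(80)  # one weight per sample

def weighted_accuracy(estimator, X_test, y_test, sample_weight=None):
    # Receives the test-fold slice of scorer_params as a keyword argument.
    return accuracy_score(y_test, estimator.predict(X_test),
                          sample_weight=sample_weight)

search = GridSearchCV(SVC(), param_grid={'C': [0.1, 1, 10]},
                      scoring=weighted_accuracy,
                      fit_params={'sample_weight': weights},
                      scorer_params={'sample_weight': weights})
search.fit(X, y)  # refit on the full data uses the full weight vector
print(search.best_params_)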