[WIP] scorer_params and sample_weight support #3524

Closed · wants to merge 5 commits
Changes from all commits
46 changes: 35 additions & 11 deletions sklearn/cross_validation.py
@@ -1077,7 +1077,8 @@ def __len__(self):


def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
-                    verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
+                    verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
+                    scorer_params=None):
"""Evaluate a score by cross-validation

Parameters
@@ -1130,6 +1131,10 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
- A string, giving an expression as a function of n_jobs,
as in '2*n_jobs'

+    scorer_params : dict, optional
+        Parameters to pass to the scorer. Can be used for sample weights
+        and sample groups.

Returns
-------
scores : array of float, shape=(len(list(cv)),)
@@ -1143,15 +1148,15 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
# independent, and that it is pickle-able.
parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
pre_dispatch=pre_dispatch)
-    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
-                                              train, test, verbose, None,
-                                              fit_params)
+    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y,
+                                              scorer, train, test, verbose,
+                                              None, fit_params, scorer_params)
for train, test in cv)
return np.array(scores)[:, 0]


def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
-                   fit_params, return_train_score=False,
+                   fit_params, scorer_params, return_train_score=False,
return_parameters=False):
"""Fit estimator and compute scores for a given dataset split.

@@ -1163,7 +1168,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
X : array-like of shape at least 2D
The data to fit.

-    y : array-like, optional, default: None
+    y : array-like or None
The target variable to try to predict in the case of
supervised learning.

@@ -1186,6 +1191,9 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
fit_params : dict or None
Parameters that will be passed to ``estimator.fit``.

+    scorer_params : dict or None
+        Parameters that will be passed to the scorer.

return_train_score : boolean, optional, default: False
Compute and return score on training set.

@@ -1224,20 +1232,36 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
if hasattr(v, '__len__') and len(v) == n_samples else v)
for k, v in fit_params.items()])

+    # Same, but take both slices
+    scorer_params = scorer_params if scorer_params is not None else {}
+    train_scorer_params = dict([(k, np.asarray(v)[train]
+                                 if hasattr(v, '__len__')
+                                 and len(v) == n_samples
+                                 else v)
+                                for k, v in scorer_params.items()])
+    test_scorer_params = dict([(k, np.asarray(v)[test]
+                                if hasattr(v, '__len__')
+                                and len(v) == n_samples
+                                else v)
+                               for k, v in scorer_params.items()])

if parameters is not None:
estimator.set_params(**parameters)

start_time = time.time()

X_train, y_train = _safe_split(estimator, X, y, train)
X_test, y_test = _safe_split(estimator, X, y, test, train)

if y_train is None:
estimator.fit(X_train, **fit_params)
else:
estimator.fit(X_train, y_train, **fit_params)
-    test_score = _score(estimator, X_test, y_test, scorer)
+    test_score = _score(estimator, X_test, y_test, scorer,
+                        **test_scorer_params)
if return_train_score:
-        train_score = _score(estimator, X_train, y_train, scorer)
+        train_score = _score(estimator, X_train, y_train, scorer,
+                             **train_scorer_params)

scoring_time = time.time() - start_time

@@ -1286,12 +1310,12 @@ def _safe_split(estimator, X, y, indices, train_indices=None):
return X_subset, y_subset


-def _score(estimator, X_test, y_test, scorer):
+def _score(estimator, X_test, y_test, scorer, **params):
"""Compute the score of an estimator on a given test set."""
if y_test is None:
-        score = scorer(estimator, X_test)
+        score = scorer(estimator, X_test, **params)
else:
-        score = scorer(estimator, X_test, y_test)
+        score = scorer(estimator, X_test, y_test, **params)
if not isinstance(score, numbers.Number):
raise ValueError("scoring must return a number, got %s (%s) instead."
% (str(score), type(score)))
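For illustration only, a hypothetical usage sketch of the API these changes enable (this snippet is not part of the diff). It assumes a plain callable scorer and that accuracy_score accepts sample_weight; full-length weight vectors are passed through fit_params and scorer_params, and _fit_and_score slices any entry whose length equals n_samples down to the current train or test fold.

# Hypothetical sketch, not part of the PR: weighted fitting and scoring
# with cross_val_score on this branch of scikit-learn.
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = rng.randint(0, 2, 100)
weights = rng.rand(100)  # one weight per sample

def weighted_accuracy(estimator, X_test, y_test, sample_weight=None):
    # scorer_params arrive here as keyword arguments, already sliced
    # down to the current test fold by _fit_and_score.
    return accuracy_score(y_test, estimator.predict(X_test),
                          sample_weight=sample_weight)

scores = cross_val_score(SVC(), X, y, scoring=weighted_accuracy, cv=5,
                         fit_params={'sample_weight': weights},
                         scorer_params={'sample_weight': weights})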
30 changes: 23 additions & 7 deletions sklearn/feature_selection/rfe.py
@@ -306,7 +306,7 @@ def __init__(self, estimator, step=1, cv=None, scoring=None,
self.estimator_params = estimator_params
self.verbose = verbose

-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
"""Fit the RFE model and automatically tune the number of selected
features.

@@ -319,6 +319,9 @@ def fit(self, X, y):
y : array-like, shape = [n_samples]
Target values (integers for classification, real numbers for
regression).

+        sample_weight : array-like, shape = [n_samples], optional (default=None)
+            Sample weights.
"""
X, y = check_X_y(X, y, "csr")
# Initialization
@@ -332,17 +335,27 @@

# Cross-validation
for n, (train, test) in enumerate(cv):
-            X_train, y_train = _safe_split(self.estimator, X, y, train)
-            X_test, y_test = _safe_split(self.estimator, X, y, test, train)
+            X_train, y_train = _safe_split(
+                self.estimator, X, y, train)
+            X_test, y_test = _safe_split(
+                self.estimator, X, y, test, train)

+            fit_params = dict()
+            score_params = dict()
+            if sample_weight is not None:
+                sample_weight = np.asarray(sample_weight)
+                fit_params['sample_weight'] = sample_weight[train]
+                score_params['sample_weight'] = sample_weight[test]

# Compute a full ranking of the features
-            ranking_ = rfe.fit(X_train, y_train).ranking_
+            ranking_ = rfe.fit(X_train, y_train, **fit_params).ranking_
# Score each subset of features
for k in range(0, max(ranking_)):
mask = np.where(ranking_ <= k + 1)[0]
estimator = clone(self.estimator)
-                estimator.fit(X_train[:, mask], y_train)
-                score = _score(estimator, X_test[:, mask], y_test, scorer)
+                estimator.fit(X_train[:, mask], y_train, **fit_params)
+                score = _score(
+                    estimator, X_test[:, mask], y_test, scorer, **score_params)

if self.verbose > 0:
print("Finished fold with %d / %d feature ranks, score=%f"
@@ -358,7 +371,10 @@ def fit(self, X, y):
n_features_to_select=k+1,
step=self.step, estimator_params=self.estimator_params)

-        rfe.fit(X, y)
+        if sample_weight is not None:
+            rfe.fit(X, y, sample_weight=sample_weight)
+        else:
+            rfe.fit(X, y)

# Set final attributes
self.support_ = rfe.support_
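Again for illustration only (not part of the diff), a hypothetical sketch of weighted recursive feature elimination with the RFECV change above. It assumes the wrapped estimator's fit accepts sample_weight (as linear SVC's does) and that RFE.fit forwards **fit_params, which the call in this diff implies; RFECV splits the weight vector per fold, sending training weights to fit and test weights to the scorer.

# Hypothetical sketch, not part of the PR: weighted RFECV.
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.rand(60, 8)
y = rng.randint(0, 2, 60)
weights = rng.rand(60)  # one weight per sample

# A linear SVC exposes coef_, which RFE needs for ranking features.
selector = RFECV(SVC(kernel='linear'), step=1, cv=3)
selector.fit(X, y, sample_weight=weights)
print(selector.support_)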
39 changes: 23 additions & 16 deletions sklearn/grid_search.py
@@ -8,6 +8,7 @@
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Andreas Mueller <amueller@ais.uni-bonn.de>
# Olivier Grisel <olivier.grisel@ensta.org>
+#          Noel Dawe <noel@dawe.me>
# License: BSD 3 clause

from abc import ABCMeta, abstractmethod
@@ -226,7 +227,8 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
n_samples_test : int
Number of test samples in this split.
"""
-    score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train,
-                                              test, verbose, parameters,
-                                              fit_params)
+    score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train,
+                                              test, verbose, parameters,
+                                              fit_params, None)
return score, parameters, n_samples_test
@@ -279,7 +281,8 @@ class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator,
@abstractmethod
def __init__(self, estimator, scoring=None,
fit_params=None, n_jobs=1, iid=True,
-                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'):
+                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
+                 scorer_params=None):

self.scoring = scoring
self.estimator = estimator
@@ -290,8 +293,9 @@ def __init__(self, estimator, scoring=None,
self.cv = cv
self.verbose = verbose
self.pre_dispatch = pre_dispatch
+        self.scorer_params = scorer_params

-    def score(self, X, y=None):
+    def score(self, X, y=None, **scorer_params):
"""Returns the score on the given test data and labels, if the search
estimator has been refit. The ``score`` function of the best estimator
is used, or the ``scoring`` parameter where unavailable.
@@ -312,12 +316,12 @@ def score(self, X, y=None):

"""
if hasattr(self.best_estimator_, 'score'):
-            return self.best_estimator_.score(X, y)
+            return self.best_estimator_.score(X, y, **scorer_params)
if self.scorer_ is None:
raise ValueError("No score function explicitly defined, "
"and the estimator doesn't provide one %s"
% self.best_estimator_)
-        return self.scorer_(self.best_estimator_, X, y)
+        return self.scorer_(self.best_estimator_, X, y, **scorer_params)

@property
def predict(self):
@@ -350,6 +354,7 @@ def _fit(self, X, y, parameter_iterable):
raise ValueError('Target variable (y) has a different number '
'of samples (%i) than data (X: %i samples)'
% (len(y), n_samples))

cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

if self.verbose > 0:
@@ -367,9 +372,10 @@ def _fit(self, X, y, parameter_iterable):
n_jobs=self.n_jobs, verbose=self.verbose,
pre_dispatch=pre_dispatch
)(
-            delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
-                                    train, test, self.verbose, parameters,
-                                    self.fit_params, return_parameters=True)
+            delayed(_fit_and_score)(clone(base_estimator), X, y,
+                                    self.scorer_, train, test,
+                                    self.verbose, parameters, self.fit_params,
+                                    self.scorer_params, return_parameters=True)
for parameters in parameter_iterable
for train, test in cv)

@@ -411,14 +417,15 @@ def _fit(self, X, y, parameter_iterable):
self.best_score_ = best.mean_validation_score

if self.refit:
+            fit_params = self.fit_params
# fit the best estimator using the entire dataset
# clone first to work around broken estimators
best_estimator = clone(base_estimator).set_params(
**best.parameters)
if y is not None:
-                best_estimator.fit(X, y, **self.fit_params)
+                best_estimator.fit(X, y, **fit_params)
else:
-                best_estimator.fit(X, **self.fit_params)
+                best_estimator.fit(X, **fit_params)
self.best_estimator_ = best_estimator
return self

@@ -566,10 +573,11 @@ class GridSearchCV(BaseSearchCV):

def __init__(self, estimator, param_grid, scoring=None,
fit_params=None, n_jobs=1, iid=True,
-                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'):
+                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
+                 scorer_params=None):
super(GridSearchCV, self).__init__(
estimator, scoring, fit_params, n_jobs, iid,
-            refit, cv, verbose, pre_dispatch)
+            refit, cv, verbose, pre_dispatch, scorer_params)
self.param_grid = param_grid
_check_param_grid(param_grid)

@@ -586,7 +594,6 @@ def fit(self, X, y=None):
y : array-like, shape = [n_samples] or [n_samples, n_output], optional
Target relative to X for classification or regression;
None for unsupervised learning.

"""
return self._fit(X, y, ParameterGrid(self.param_grid))

@@ -714,15 +721,16 @@ class RandomizedSearchCV(BaseSearchCV):

def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
-                 verbose=0, pre_dispatch='2*n_jobs', random_state=None):
+                 verbose=0, pre_dispatch='2*n_jobs', random_state=None,
+                 scorer_params=None):

self.param_distributions = param_distributions
self.n_iter = n_iter
self.random_state = random_state
super(RandomizedSearchCV, self).__init__(
estimator=estimator, scoring=scoring, fit_params=fit_params,
n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
-            pre_dispatch=pre_dispatch)
+            pre_dispatch=pre_dispatch, scorer_params=scorer_params)

def fit(self, X, y=None):
"""Run fit on the estimator with randomly drawn parameters.
Expand All @@ -736,7 +744,6 @@ def fit(self, X, y=None):
y : array-like, shape = [n_samples] or [n_samples, n_output], optional
Target relative to X for classification or regression;
None for unsupervised learning.

"""
sampled_params = ParameterSampler(self.param_distributions,
self.n_iter,
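Finally, a hypothetical end-to-end sketch (not part of the diff) of a weighted grid search. The scorer_params dict given to the constructor flows through _fit_and_score and is sliced per test fold, exactly like fit_params; the callable scorer below is a made-up example, and the weights reused for the refit are an assumption of this sketch.

# Hypothetical sketch, not part of the PR: weighted grid search.
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(0)
X = rng.rand(80, 5)
y = rng.randint(0, 2, 80)
weights = rng.rand(80)  # one weight per sample

def weighted_accuracy(estimator, X_test, y_test, sample_weight=None):
    # Receives the test-fold slice of scorer_params as a keyword argument.
    return accuracy_score(y_test, estimator.predict(X_test),
                          sample_weight=sample_weight)

search = GridSearchCV(SVC(), param_grid={'C': [0.1, 1, 10]},
                      scoring=weighted_accuracy,
                      fit_params={'sample_weight': weights},
                      scorer_params={'sample_weight': weights})
search.fit(X, y)  # refit on the full data uses the full weight vector
print(search.best_params_)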