Skip to content

WIP GridSearch with advanced score functions. #1198

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 17 additions & 14 deletions sklearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
import numpy as np
from scipy import sparse

from .metrics import r2_score
from .metrics import r2_score, zero_one_score


###############################################################################
def clone(estimator, safe=True):
"""Constructs a new estimator with the same parameters.

Expand Down Expand Up @@ -96,7 +95,6 @@ def clone(estimator, safe=True):
return new_object


###############################################################################
def _pprint(params, offset=0, printer=repr):
"""Pretty print the dictionary 'params'

Expand Down Expand Up @@ -148,7 +146,6 @@ def _pprint(params, offset=0, printer=repr):
return lines


###############################################################################
class BaseEstimator(object):
"""Base class for all estimators in scikit-learn

Expand Down Expand Up @@ -262,11 +259,10 @@ def __str__(self):
)


###############################################################################
class ClassifierMixin(object):
"""Mixin class for all classifiers in scikit-learn"""

def score(self, X, y):
def score(self, X, y, score_func=None):
"""Returns the mean accuracy on the given test data and labels.

Parameters
Expand All @@ -282,14 +278,23 @@ def score(self, X, y):
z : float

"""
return np.mean(self.predict(X) == y)
if score_func is None:
score_func = zero_one_score

if getattr(score_func, "requires_thresholds", False):
try:
score = score_func(y, self.predict_proba(X)[:, 0])
except (AttributeError, NotImplementedError):
score = score_func(y, self.decision_function(X))
else:
score = score_func(y, self.predict(X))
return score


###############################################################################
class RegressorMixin(object):
"""Mixin class for all regression estimators in scikit-learn"""

def score(self, X, y):
def score(self, X, y, score_func=None):
"""Returns the coefficient of determination R^2 of the prediction.

The coefficient R^2 is defined as (1 - u/v), where u is the
Expand All @@ -309,10 +314,11 @@ def score(self, X, y):
-------
z : float
"""
return r2_score(y, self.predict(X))
if score_func is None:
score_func = r2_score
return score_func(y, self.predict(X))


###############################################################################
class ClusterMixin(object):
"""Mixin class for all cluster estimators in scikit-learn"""
def fit_predict(self, X, y=None):
Expand All @@ -334,7 +340,6 @@ def fit_predict(self, X, y=None):
return self.labels_


###############################################################################
class TransformerMixin(object):
"""Mixin class for all transformers in scikit-learn"""

Expand Down Expand Up @@ -372,13 +377,11 @@ def fit_transform(self, X, y=None, **fit_params):
return self.fit(X, y, **fit_params).transform(X)


###############################################################################
class MetaEstimatorMixin(object):
    """Mixin class for all meta estimators in scikit-learn

    The class body defines no attributes or methods; inheriting from it
    only marks a class as a meta estimator.
    """
    # this is just a tag for the moment


###############################################################################
# XXX: Temporary solution to figure out if an estimator is a classifier

def _get_sub_estimator(estimator):
Expand Down
8 changes: 2 additions & 6 deletions sklearn/cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,8 +997,6 @@ def __len__(self):
return self.n_iterations


##############################################################################

def _cross_val_score(estimator, X, y, score_func, train, test, verbose,
fit_params):
"""Inner loop for cross validation"""
Expand All @@ -1008,16 +1006,14 @@ def _cross_val_score(estimator, X, y, score_func, train, test, verbose,
for k, v in fit_params.items()])
if y is None:
estimator.fit(X[train], **fit_params)
# XXX this should be handled in estimator.score
if score_func is None:
score = estimator.score(X[test])
else:
score = score_func(X[test])
else:
estimator.fit(X[train], y[train], **fit_params)
if score_func is None:
score = estimator.score(X[test], y[test])
else:
score = score_func(y[test], estimator.predict(X[test]))
score = estimator.score(X[test], y[test], score_func)
if verbose > 1:
print("score: %f" % score)
return score
Expand Down
14 changes: 2 additions & 12 deletions sklearn/grid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,8 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func,
if loss_func is not None:
y_pred = clf.predict(X_test)
this_score = -loss_func(y_test, y_pred)
elif score_func is not None:
y_pred = clf.predict(X_test)
this_score = score_func(y_test, y_pred)
else:
this_score = clf.score(X_test, y_test)
this_score = clf.score(X_test, y_test, score_func=score_func)

if y is not None:
if hasattr(y, 'shape'):
Expand Down Expand Up @@ -447,11 +444,4 @@ def _fit(self, X, y):
return self

def score(self, X, y=None):
if hasattr(self.best_estimator_, 'score'):
return self.best_estimator_.score(X, y)
if self.score_func is None:
raise ValueError("No score function explicitly defined, "
"and the estimator doesn't provide one %s"
% self.best_estimator_)
y_predicted = self.predict(X)
return self.score_func(y, y_predicted)
return self.best_estimator_.score(X, y, score_func=self.score_func)
12 changes: 10 additions & 2 deletions sklearn/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
from ..utils import check_arrays


def requires_thresholds(f):
    """Decorator to tag scores that need continuous inputs.

    Sets the attribute ``requires_thresholds`` to ``True`` on ``f`` and
    returns ``f`` itself (no wrapper is created), so the decorated
    callable keeps its identity, name and docstring.

    Parameters
    ----------
    f : callable
        Score function to tag. It must accept attribute assignment.

    Returns
    -------
    f : callable
        The same object that was passed in, now carrying the tag.
    """
    setattr(f, "requires_thresholds", True)
    return f
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could even check the input in the wrapper and yield a meaningful error message if the second input array has a 1D shape.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does that work? Write a wrapper function for f that does the check and then applies f?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes



def unique_labels(*lists_of_labels):
"""Extract an ordered array of unique labels"""
labels = set()
Expand Down Expand Up @@ -180,6 +186,7 @@ def roc_curve(y_true, y_score):
return fpr, tpr, thresholds[::-1]


@requires_thresholds
def average_precision_score(y_true, y_score):
"""Compute average precision (AP) from prediction scores.

Expand Down Expand Up @@ -215,6 +222,7 @@ def average_precision_score(y_true, y_score):
return auc(recall, precision)


@requires_thresholds
def auc_score(y_true, y_score):
"""Compute Area Under the Curve (AUC) from prediction scores.

Expand Down Expand Up @@ -985,12 +993,12 @@ def zero_one_score(y_true, y_pred):
# Loss functions

def zero_one(y_true, y_pred):
"""Zero-One classification loss
"""Zero-One classification loss.

Positive integer (number of misclassifications). The best performance
is 0.

Return the number of errors
Return the number of errors.

Parameters
----------
Expand Down
5 changes: 3 additions & 2 deletions sklearn/multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,13 @@ def multilabel_(self):
"""Whether this is a multilabel classifier"""
return self.label_binarizer_.multilabel

def score(self, X, y):
def score(self, X, y, score_func=None):
if self.multilabel_:
raise NotImplementedError(
"score is not supported for multilabel classifiers")
else:
return super(OneVsRestClassifier, self).score(X, y)
return super(OneVsRestClassifier, self).score(X, y,
score_func=score_func)

@property
def classes_(self):
Expand Down
4 changes: 2 additions & 2 deletions sklearn/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,14 +190,14 @@ def inverse_transform(self, X):
Xt = step.inverse_transform(Xt)
return Xt

def score(self, X, y=None):
def score(self, X, y=None, score_func=None):
"""Applies transforms to the data, and the score method of the
final estimator. Valid only if the final estimator implements
score."""
Xt = X
for name, transform in self.steps[:-1]:
Xt = transform.transform(Xt)
return self.steps[-1][-1].score(Xt, y)
return self.steps[-1][-1].score(Xt, y, score_func)

@property
def _pairwise(self):
Expand Down
13 changes: 13 additions & 0 deletions sklearn/svm/tests/test_svm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from sklearn.datasets.samples_generator import make_classification
from sklearn.utils import check_random_state
from sklearn.utils.testing import assert_greater, assert_less
from sklearn.metrics import auc_score
from sklearn.grid_search import GridSearchCV

# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
Expand Down Expand Up @@ -575,6 +577,17 @@ def test_svc_clone_with_callable_kernel():
b.decision_function(X)


def test_auc_grid_search():
    """Smoke test: GridSearchCV fits with ``auc_score`` as ``score_func``.

    Runs the same grid over ``C`` twice on a two-class problem: once
    with an SVC built with default arguments and once with an SVC built
    with ``probability=True``. There are no assertions; the test passes
    when neither ``fit`` call raises.
    """
    X, y = make_classification(n_classes=2, random_state=0)
    param_grid = {'C': [0.01, 0.1, 1]}
    # Default SVC first, then the probability-enabled variant.
    for svc_kwargs in ({}, {'probability': True}):
        estimator = svm.SVC(**svc_kwargs)
        search = GridSearchCV(estimator, param_grid, score_func=auc_score)
        search.fit(X, y)


if __name__ == '__main__':
import nose
nose.runmodule()
2 changes: 1 addition & 1 deletion sklearn/tests/test_cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def fit(self, X, Y, sample_weight=None, class_prior=None):
def predict(self, T):
return T.shape[0]

def score(self, X=None, Y=None):
def score(self, X=None, Y=None, score_func=None):
return 1. / (1 + np.abs(self.a))


Expand Down
5 changes: 4 additions & 1 deletion sklearn/tests/test_grid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def fit(self, X, Y):
def predict(self, T):
return T.shape[0]

def score(self, X=None, Y=None):
def score(self, X=None, Y=None, score_func=None):
if self.foo_param > 1:
score = 1.
else:
Expand Down Expand Up @@ -200,6 +200,9 @@ def fit(self, X, y):
def predict(self, X):
return np.zeros(X.shape[0])

def score(self, X, y, score_func=None):
    """Return a constant score of 0, ignoring every argument.

    ``X``, ``y`` and ``score_func`` are accepted but never read, so the
    stub can be called with or without a ``score_func``.
    """
    return 0


def test_refit():
"""Regression test for bug in refitting
Expand Down