
[MRG] Ensure that classification metrics support string label #2170


Closed · wants to merge 9 commits
3 changes: 3 additions & 0 deletions doc/whats_new.rst
@@ -7,6 +7,7 @@

Changelog
---------

- Missing values with sparse and dense matrices can be imputed with the
transformer :class:`preprocessing.Imputer` by `Nicolas Trésegnie`_.

@@ -145,6 +146,8 @@ Changelog
how to use OOB estimates to select the number of trees was added.
By `Peter Prettenhofer`_.

- Most metrics now support string labels for multiclass classification
by `Arnaud Joly`_ and `Lars Buitinck`_.


API changes summary
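For context on the changelog entry above, a hedged sketch of the usage it enables; the labels and values below are made up for illustration and are not taken from the pull request:

```python
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix

# Hypothetical string-labelled targets and predictions.
y_true = np.array(["spam", "eggs", "spam", "spam", "eggs"])
y_pred = np.array(["spam", "spam", "spam", "eggs", "eggs"])

# pos_label and labels may now be strings rather than integers.
print(f1_score(y_true, y_pred, pos_label="spam"))  # score for the "spam" class
print(confusion_matrix(y_true, y_pred, labels=["eggs", "spam"]))
```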
38 changes: 23 additions & 15 deletions sklearn/metrics/metrics.py
Expand Up @@ -425,7 +425,13 @@ def matthews_corrcoef(y_true, y_pred):
if y_type != "binary":
raise ValueError("%s is not supported" % y_type)

mcc = np.corrcoef(y_true, y_pred)[0, 1]
tp, tn, fp, fn = _tp_tn_fp_fn(y_true, y_pred)
tp, tn, fp, fn = tp[1], tn[1], fp[1], fn[1]

num = (tp * tn - fp * fn)
den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = num / den

if np.isnan(mcc):
return 0.
else:
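For context (not part of the diff): the replacement computes MCC from the confusion counts instead of calling `np.corrcoef` on the raw label arrays, which is what allows non-numeric labels. A minimal standalone sketch of that computation, using made-up binary string labels:

```python
import numpy as np

y_true = np.array(["spam", "eggs", "spam", "spam", "eggs"])
y_pred = np.array(["spam", "spam", "spam", "eggs", "eggs"])

# Confusion counts for the (arbitrarily chosen) positive class "spam".
pos = "spam"
tp = np.sum((y_true == pos) & (y_pred == pos))
tn = np.sum((y_true != pos) & (y_pred != pos))
fp = np.sum((y_true != pos) & (y_pred == pos))
fn = np.sum((y_true == pos) & (y_pred != pos))

# MCC = (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)),
# taken as 0 when the denominator vanishes.
num = tp * tn - fp * fn
den = np.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
mcc = num / den if den else 0.0
print(mcc)  # ~0.167 for these toy arrays
```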
@@ -499,7 +505,7 @@ def _binary_clf_curve(y_true, y_score, pos_label=None):
return fps, tps, y_score[threshold_idxs]


def precision_recall_curve(y_true, probas_pred):
def precision_recall_curve(y_true, probas_pred, pos_label=None):
"""Compute precision-recall pairs for different probability thresholds

Note: this implementation is restricted to the binary classification task.
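A hedged usage sketch for the new `pos_label` argument (input values assumed for illustration): with string targets the positive class cannot be inferred from the label values, so it has to be named explicitly.

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

y_true = np.array(["eggs", "eggs", "spam", "spam"])
scores = np.array([0.1, 0.4, 0.35, 0.8])  # decision scores for the "spam" class

precision, recall, thresholds = precision_recall_curve(y_true, scores,
                                                       pos_label="spam")
```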
@@ -705,6 +711,7 @@ def confusion_matrix(y_true, y_pred, labels=None):
if y_type not in ("binary", "multiclass"):
raise ValueError("%s is not supported" % y_type)


if labels is None:
labels = unique_labels(y_true, y_pred)
else:
@@ -1066,7 +1073,7 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'):
labels : array
Integer array of labels.

pos_label : int, 1 by default
pos_label : str or int, 1 by default
If ``average`` is not ``None`` and the classification target is binary,
only this class's scores will be returned.

@@ -1190,7 +1197,7 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
labels : array
Integer array of labels.

pos_label : int, 1 by default
pos_label : str or int, 1 by default
If ``average`` is not ``None`` and the classification target is binary,
only this class's scores will be returned.

@@ -1373,6 +1380,7 @@ def _tp_tn_fp_fn(y_true, y_pred, labels=None):
labels = unique_labels(y_true, y_pred)
else:
labels = np.asarray(labels)

n_labels = labels.size
true_pos = np.zeros((n_labels, ), dtype=np.int)
false_pos = np.zeros((n_labels, ), dtype=np.int)
@@ -1452,7 +1460,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
labels : array
Integer array of labels.

pos_label : int, 1 by default
pos_label : str or int, 1 by default
If ``average`` is not ``None`` and the classification target is binary,
only this class's scores will be returned.

@@ -1664,8 +1672,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
if len(labels) == 1:
# Only negative labels
return (0., 0., 0., 0)
raise ValueError("pos_label=%d is not a valid label: %r" %
(pos_label, labels))
raise ValueError("pos_label=%r is not a valid label: %r" %
(pos_label, list(labels)))
pos_label_idx = list(labels).index(pos_label)
return (precision[pos_label_idx], recall[pos_label_idx],
fscore[pos_label_idx], support[pos_label_idx])
@@ -1735,7 +1743,7 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
labels : array
Integer array of labels.

pos_label : int, 1 by default
pos_label : str or int, 1 by default
If ``average`` is not ``None`` and the classification target is binary,
only this class's scores will be returned.

@@ -1857,7 +1865,7 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'):
labels : array
Integer array of labels.

pos_label : int, 1 by default
pos_label : str or int, 1 by default
If ``average`` is not ``None`` and the classification target is binary,
only this class's scores will be returned.

@@ -2021,13 +2029,13 @@ class 2 1.00 1.00 1.00 2
if labels is None:
labels = unique_labels(y_true, y_pred)
else:
labels = np.asarray(labels, dtype=np.int)
labels = np.asarray(labels)

last_line_heading = 'avg / total'

if target_names is None:
width = len(last_line_heading)
target_names = ['%d' % l for l in labels]
target_names = ['{0}'.format(l) for l in labels]
else:
width = max(len(cn) for cn in target_names)
width = max(width, len(last_line_heading))
@@ -2049,8 +2057,8 @@ class 2 1.00 1.00 1.00 2
for i, label in enumerate(labels):
values = [target_names[i]]
for v in (p[i], r[i], f1[i]):
values += ["%0.2f" % float(v)]
values += ["%d" % int(s[i])]
values += ["{0:0.2f}".format(v)]
values += ["{0}".format(s[i])]
report += fmt % tuple(values)

report += '\n'
@@ -2060,8 +2068,8 @@ class 2 1.00 1.00 1.00 2
for v in (np.average(p, weights=s),
np.average(r, weights=s),
np.average(f1, weights=s)):
values += ["%0.2f" % float(v)]
values += ['%d' % np.sum(s)]
values += ["{0:0.2f}".format(v)]
values += ['{0}'.format(np.sum(s))]
report += fmt % tuple(values)
return report

137 changes: 116 additions & 21 deletions sklearn/metrics/tests/test_metrics.py
@@ -55,7 +55,14 @@

from sklearn.externals.six.moves import xrange

ALL_METRICS = {
REGRESSION_METRICS = {
"mean_absolute_error": mean_absolute_error,
"mean_squared_error": mean_squared_error,
"explained_variance_score": explained_variance_score,
"r2_score": r2_score,
}

CLASSIFICATION_METRICS = {
"accuracy_score": accuracy_score,
"unormalized_accuracy_score": partial(accuracy_score, normalize=False),
"confusion_matrix": confusion_matrix,
@@ -74,8 +81,6 @@
"f2_score": partial(fbeta_score, beta=2),
"f0.5_score": partial(fbeta_score, beta=0.5),
"matthews_corrcoef_score": matthews_corrcoef,
"auc_score": auc_score,
"average_precision_score": average_precision_score,

"weighted_f0.5_score": partial(fbeta_score, average="weighted", beta=0.5),
"weighted_f1_score": partial(f1_score, average="weighted"),
@@ -95,13 +100,49 @@
"macro_precision_score": partial(precision_score, average="macro"),
"macro_recall_score": partial(recall_score, average="macro"),

"mean_absolute_error": mean_absolute_error,
"mean_squared_error": mean_squared_error,
"explained_variance_score": explained_variance_score,
"r2_score": r2_score,
"confusion_matrix": partial(confusion_matrix, labels=range(3)),
"confusion_matrix": partial(confusion_matrix),
}

THRESHOLDED_METRICS = {
"auc_score": auc_score,
"average_precision_score": average_precision_score,
}

ALL_METRICS = dict()
ALL_METRICS.update(THRESHOLDED_METRICS)
ALL_METRICS.update(CLASSIFICATION_METRICS)
ALL_METRICS.update(REGRESSION_METRICS)

METRICS_WITH_POS_LABEL = [
"roc_curve",

"precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",

"weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
"weighted_precision_score", "weighted_recall_score",

"micro_f0.5_score", "micro_f1_score", "micro_f2_score",
"micro_precision_score", "micro_recall_score",

"macro_f0.5_score", "macro_f1_score", "macro_f2_score",
"macro_precision_score", "macro_recall_score",
]

METRICS_WITH_LABELS = [
"confusion_matrix",

"precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",

"weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
"weighted_precision_score", "weighted_recall_score",

"micro_f0.5_score", "micro_f1_score", "micro_f2_score",
"micro_precision_score", "micro_recall_score",

"macro_f0.5_score", "macro_f1_score", "macro_f2_score",
"macro_precision_score", "macro_recall_score",
]

METRICS_WITH_NORMALIZE_OPTION = {
"accuracy_score ": accuracy_score,
"jaccard_similarity_score": jaccard_similarity_score,
@@ -211,11 +252,6 @@
"confusion_matrix": partial(confusion_matrix, labels=range(3)),
}

THRESHOLDED_METRICS = {
"auc_score": auc_score,
"average_precision_score": average_precision_score,
}


def make_prediction(dataset=None, binary=False):
"""Make some classification predictions on a toy dataset using a SVC
@@ -706,24 +742,47 @@ def test_classification_report_multiclass():
expected_report = """\
precision recall f1-score support

0 0.82 0.92 0.87 25
1 0.56 0.17 0.26 30
2 0.47 0.90 0.62 20
0 0.83 0.79 0.81 24
1 0.33 0.10 0.15 31
2 0.42 0.90 0.57 20

avg / total 0.62 0.61 0.56 75
avg / total 0.51 0.53 0.47 75
"""
report = classification_report(y_true, y_pred)
assert_equal(report, expected_report)


def test_classification_report_multiclass_with_string_label():
y_true, y_pred, _ = make_prediction(binary=False)

y_true = np.array(["blue", "green", "red"])[y_true]
y_pred = np.array(["blue", "green", "red"])[y_pred]

expected_report = """\
precision recall f1-score support

blue 0.83 0.79 0.81 24
green 0.33 0.10 0.15 31
red 0.42 0.90 0.57 20

avg / total 0.51 0.53 0.47 75
"""
report = classification_report(y_true, y_pred)
assert_equal(report, expected_report)

expected_report = """\
precision recall f1-score support

a 0.83 0.79 0.81 24
b 0.33 0.10 0.15 31
c 0.42 0.90 0.57 20

avg / total 0.51 0.53 0.47 75
"""
report = classification_report(y_true, y_pred,
target_names=["a", "b", "c"])
assert_equal(report, expected_report)


def test_multilabel_classification_report():

@@ -891,7 +950,7 @@ def test_symmetry():

# We shouldn't forget any metrics
assert_equal(set(SYMMETRIC_METRICS).union(NOT_SYMMETRIC_METRICS,
THRESHOLDED_METRICS),
THRESHOLDED_METRICS),
set(ALL_METRICS))

assert_equal(set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)),
@@ -1009,6 +1068,42 @@ def test_format_invariance_with_1d_vectors():
assert_raises(ValueError, metric, y1_row, y2_row)


def test_invariance_string_vs_numbers_labels():
"""Ensure that classification metrics with string labels"""
y1, y2, _ = make_prediction(binary=True)

y1_str = np.array(["eggs", "spam"])[y1]
y2_str = np.array(["eggs", "spam"])[y2]

pos_label_str = "spam"
labels_str = ["eggs", "spam"]

for name, metric in CLASSIFICATION_METRICS.items():
measure_with_number = metric(y1, y2)

# Ugly, but handles the metrics that also take pos_label or labels arguments
metric_str = metric
if name in METRICS_WITH_POS_LABEL:
metric_str = partial(metric_str, pos_label=pos_label_str)

measure_with_str = metric_str(y1_str, y2_str)

assert_array_equal(measure_with_number, measure_with_str,
err_msg="{0} failed string vs number invariance "
"test".format(name))

if name in METRICS_WITH_LABELS:
metric_str = partial(metric_str, labels=labels_str)
measure_with_str = metric_str(y1_str, y2_str)
assert_array_equal(measure_with_number, measure_with_str,
err_msg="{0} failed string vs number "
"invariance test".format(name))

# TODO Currently not supported
for name, metrics in THRESHOLDED_METRICS.items():
assert_raises(ValueError, metrics, y1_str, y2_str)


def test_clf_single_sample():
"""Non-regression test: scores should work with a single sample.
