From 8bbd0c0114b387b08026065bd8e2cbf8efa63b66 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Fri, 19 Jul 2013 08:23:01 +0200 Subject: [PATCH 1/9] ENH add support for string input with classification metrics --- sklearn/metrics/metrics.py | 21 +++-- sklearn/metrics/tests/test_metrics.py | 120 +++++++++++++++++++++----- 2 files changed, 114 insertions(+), 27 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index f564ff72338af..967ec7294166b 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -27,6 +27,7 @@ from ..utils import check_arrays from ..utils import deprecated from ..utils.fixes import divide +from ..utils.fixes import unique from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target @@ -425,7 +426,13 @@ def matthews_corrcoef(y_true, y_pred): if y_type != "binary": raise ValueError("%s is not supported" % y_type) - mcc = np.corrcoef(y_true, y_pred)[0, 1] + tp, tn, fp, fn = _tp_tn_fp_fn(y_true, y_pred) + tp, tn, fp, fn = tp[1], tn[1], fp[1], fn[1] + + num = (tp * tn - fp * fn) + den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + mcc = num / den + if np.isnan(mcc): return 0. else: @@ -499,7 +506,7 @@ def _binary_clf_curve(y_true, y_score, pos_label=None): return fps, tps, y_score[threshold_idxs] -def precision_recall_curve(y_true, probas_pred): +def precision_recall_curve(y_true, probas_pred, pos_label=None): """Compute precision-recall pairs for different probability thresholds Note: this implementation is restricted to the binary classification task. @@ -705,6 +712,7 @@ def confusion_matrix(y_true, y_pred, labels=None): if y_type not in ("binary", "multiclass"): raise ValueError("%s is not supported" % y_type) + if labels is None: labels = unique_labels(y_true, y_pred) else: @@ -1373,6 +1381,7 @@ def _tp_tn_fp_fn(y_true, y_pred, labels=None): labels = unique_labels(y_true, y_pred) else: labels = np.asarray(labels) + n_labels = labels.size true_pos = np.zeros((n_labels, ), dtype=np.int) false_pos = np.zeros((n_labels, ), dtype=np.int) @@ -2021,13 +2030,13 @@ class 2 1.00 1.00 1.00 2 if labels is None: labels = unique_labels(y_true, y_pred) else: - labels = np.asarray(labels, dtype=np.int) + labels = np.asarray(labels) last_line_heading = 'avg / total' if target_names is None: width = len(last_line_heading) - target_names = ['%d' % l for l in labels] + target_names = ['{0}'.format(l) for l in labels] else: width = max(len(cn) for cn in target_names) width = max(width, len(last_line_heading)) @@ -2049,8 +2058,8 @@ class 2 1.00 1.00 1.00 2 for i, label in enumerate(labels): values = [target_names[i]] for v in (p[i], r[i], f1[i]): - values += ["%0.2f" % float(v)] - values += ["%d" % int(s[i])] + values += ["{0:0.2f}".format(float(v))] + values += ["{0}".format(s[i])] report += fmt % tuple(values) report += '\n' diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index a4d4327563360..f719bee23f54c 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1,6 +1,7 @@ from __future__ import division, print_function import warnings +import inspect import numpy as np from functools import partial @@ -55,7 +56,14 @@ from sklearn.externals.six.moves import xrange -ALL_METRICS = { +REGRESSION_METRICS = { + "mean_absolute_error": mean_absolute_error, + "mean_squared_error": mean_squared_error, + "explained_variance_score": explained_variance_score, + "r2_score": r2_score, +} + +CLASSIFICATION_METRICS = { 
"accuracy_score": accuracy_score, "unormalized_accuracy_score": partial(accuracy_score, normalize=False), "confusion_matrix": confusion_matrix, @@ -74,8 +82,6 @@ "f2_score": partial(fbeta_score, beta=2), "f0.5_score": partial(fbeta_score, beta=0.5), "matthews_corrcoef_score": matthews_corrcoef, - "auc_score": auc_score, - "average_precision_score": average_precision_score, "weighted_f0.5_score": partial(fbeta_score, average="weighted", beta=0.5), "weighted_f1_score": partial(f1_score, average="weighted"), @@ -95,13 +101,19 @@ "macro_precision_score": partial(precision_score, average="macro"), "macro_recall_score": partial(recall_score, average="macro"), - "mean_absolute_error": mean_absolute_error, - "mean_squared_error": mean_squared_error, - "explained_variance_score": explained_variance_score, - "r2_score": r2_score, - "confusion_matrix": partial(confusion_matrix, labels=range(3)), + "confusion_matrix": partial(confusion_matrix), +} + +THRESHOLDED_METRICS = { + "auc_score": auc_score, + "average_precision_score": average_precision_score, } +ALL_METRICS = dict() +ALL_METRICS.update(THRESHOLDED_METRICS) +ALL_METRICS.update(CLASSIFICATION_METRICS) +ALL_METRICS.update(REGRESSION_METRICS) + METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score ": accuracy_score, "jaccard_similarity_score": jaccard_similarity_score, @@ -211,11 +223,6 @@ "confusion_matrix": partial(confusion_matrix, labels=range(3)), } -THRESHOLDED_METRICS = { - "auc_score": auc_score, - "average_precision_score": average_precision_score, -} - def make_prediction(dataset=None, binary=False): """Make some classification predictions on a toy dataset using a SVC @@ -706,24 +713,53 @@ def test_classification_report_multiclass(): expected_report = """\ precision recall f1-score support - 0 0.82 0.92 0.87 25 - 1 0.56 0.17 0.26 30 - 2 0.47 0.90 0.62 20 + 0 0.83 0.79 0.81 24 + 1 0.33 0.10 0.15 31 + 2 0.42 0.90 0.57 20 -avg / total 0.62 0.61 0.56 75 +avg / total 0.51 0.53 0.47 75 """ + report = classification_report(y_true, y_pred) + assert_equal(report, expected_report) + + +def test_classification_report_multiclass_with_string_label(): + y_true, y_pred, _ = make_prediction(binary=False) + + y_true = y_true.astype(np.str) + y_true[y_true == "0"] = "blue" + y_true[y_true == "1"] = "green" + y_true[y_true == "2"] = "red" + y_pred = y_pred.astype(np.str) + y_pred[y_pred == "0"] = "blue" + y_pred[y_pred == "1"] = "green" + y_pred[y_pred == "2"] = "red" + expected_report = """\ precision recall f1-score support - 0 0.83 0.79 0.81 24 - 1 0.33 0.10 0.15 31 - 2 0.42 0.90 0.57 20 + blue 0.83 0.79 0.81 24 + green 0.33 0.10 0.15 31 + red 0.42 0.90 0.57 20 avg / total 0.51 0.53 0.47 75 """ report = classification_report(y_true, y_pred) assert_equal(report, expected_report) + expected_report = """\ + precision recall f1-score support + + a 0.83 0.79 0.81 24 + b 0.33 0.10 0.15 31 + c 0.42 0.90 0.57 20 + +avg / total 0.51 0.53 0.47 75 +""" + report = classification_report(y_true, y_pred, + target_names=["a", "b", "c"]) + assert_equal(report, expected_report) + def test_multilabel_classification_report(): @@ -891,7 +927,7 @@ def test_symmetry(): # We shouldn't forget any metrics assert_equal(set(SYMMETRIC_METRICS).union(NOT_SYMMETRIC_METRICS, - THRESHOLDED_METRICS), + THRESHOLDED_METRICS), set(ALL_METRICS)) assert_equal(set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)), @@ -1009,6 +1045,48 @@ def test_format_invariance_with_1d_vectors(): assert_raises(ValueError, metric, y1_row, y2_row) +def test_invariance_string_vs_numbers_labels(): + 
"""Ensure that classification metrics with string labels""" + y1, y2, _ = make_prediction(binary=True) + + y1_str = y1.astype(np.str) + y1_str[y1_str == "0"] = "eggs" + y1_str[y1_str == "1"] = "spam" + y2_str = y2.astype(np.str) + y2_str[y2_str == "0"] = "eggs" + y2_str[y2_str == "1"] = "spam" + + pos_label_str = "spam" + labels_str = ["eggs", "spam"] + + for name, metric in CLASSIFICATION_METRICS.items(): + print(name) + measure_with_number = metric(y1, y2) + + # Ugly, but handle case with a pos_label + if hasattr(metric, "func"): + argspect = inspect.getargspec(metric.func) + else: + argspect = inspect.getargspec(metric) + + metric_str = metric + if "pos_label" in argspect[0]: + metric_str = partial(metric_str, pos_label=pos_label_str) + + if "labels" in argspect[0]: + metric_str = partial(metric_str, labels=labels_str) + + measure_with_str = metric_str(y1_str, y2_str) + + assert_array_equal(measure_with_number, measure_with_str, + err_msg="{0} failed string vs number invariance " + "test".format(name)) + + # Currently not supported + for name, metrics in THRESHOLDED_METRICS.items(): + assert_raises(ValueError, metrics, y1_str, y2_str) + + def test_clf_single_sample(): """Non-regression test: scores should work with a single sample. From 8fdc542ea7aadc8988de03547727459e209ae39b Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Fri, 19 Jul 2013 09:52:51 +0200 Subject: [PATCH 2/9] ENH use the new format syntax --- sklearn/metrics/metrics.py | 6 +++--- sklearn/metrics/tests/test_metrics.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 967ec7294166b..67e23db45d7c4 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -2058,7 +2058,7 @@ class 2 1.00 1.00 1.00 2 for i, label in enumerate(labels): values = [target_names[i]] for v in (p[i], r[i], f1[i]): - values += ["{0:0.2f}".format(float(v))] + values += ["{0:0.2f}".format(v)] values += ["{0}".format(s[i])] report += fmt % tuple(values) @@ -2069,8 +2069,8 @@ class 2 1.00 1.00 1.00 2 for v in (np.average(p, weights=s), np.average(r, weights=s), np.average(f1, weights=s)): - values += ["%0.2f" % float(v)] - values += ['%d' % np.sum(s)] + values += ["{0:0.2f}".format(v)] + values += ['{0}'.format(np.sum(s))] report += fmt % tuple(values) return report diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index f719bee23f54c..d730d8d2d5ada 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1060,10 +1060,9 @@ def test_invariance_string_vs_numbers_labels(): labels_str = ["eggs", "spam"] for name, metric in CLASSIFICATION_METRICS.items(): - print(name) measure_with_number = metric(y1, y2) - # Ugly, but handle case with a pos_label + # Ugly, but handle case with a pos_label and label if hasattr(metric, "func"): argspect = inspect.getargspec(metric.func) else: @@ -1082,7 +1081,7 @@ def test_invariance_string_vs_numbers_labels(): err_msg="{0} failed string vs number invariance " "test".format(name)) - # Currently not supported + # TODO Currently not supported for name, metrics in THRESHOLDED_METRICS.items(): assert_raises(ValueError, metrics, y1_str, y2_str) From cfd013997e3d5cb492d0462021299b4ff585c8fd Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Fri, 19 Jul 2013 13:27:07 +0200 Subject: [PATCH 3/9] ENH remove inspect --- sklearn/metrics/tests/test_metrics.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git 
a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index d730d8d2d5ada..aba6e18db53fa 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1,7 +1,6 @@ from __future__ import division, print_function import warnings -import inspect import numpy as np from functools import partial @@ -114,6 +113,21 @@ ALL_METRICS.update(CLASSIFICATION_METRICS) ALL_METRICS.update(REGRESSION_METRICS) +METRICS_WITH_POS_LABEL = [ + "roc_curve", + + "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", + + "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", + "weighted_precision_score", "weighted_recall_score", + + "micro_f0.5_score", "micro_f1_score", "micro_f2_score", + "micro_precision_score", "micro_recall_score", + + "macro_f0.5_score", "macro_f1_score", "macro_f2_score", + "macro_precision_score", "macro_recall_score", +] + METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score ": accuracy_score, "jaccard_similarity_score": jaccard_similarity_score, @@ -1063,18 +1077,10 @@ def test_invariance_string_vs_numbers_labels(): measure_with_number = metric(y1, y2) # Ugly, but handle case with a pos_label and label - if hasattr(metric, "func"): - argspect = inspect.getargspec(metric.func) - else: - argspect = inspect.getargspec(metric) - metric_str = metric - if "pos_label" in argspect[0]: + if name in METRICS_WITH_POS_LABEL: metric_str = partial(metric_str, pos_label=pos_label_str) - if "labels" in argspect[0]: - metric_str = partial(metric_str, labels=labels_str) - measure_with_str = metric_str(y1_str, y2_str) assert_array_equal(measure_with_number, measure_with_str, From 038a1fa692b52d317937dda15d8357ebf18c5013 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Mon, 22 Jul 2013 14:45:07 +0200 Subject: [PATCH 4/9] TST fix string labels in metrics tests Also found a faulty format op in the error handling. 
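The faulty format op is the `%d` conversion in the `ValueError` raised by `precision_recall_fscore_support` when `pos_label` is not among the observed labels: once `pos_label` may be a string, `%d` raises a `TypeError` of its own and masks the intended message. A minimal sketch of the failure mode, with illustrative values only:

    # "%d" cannot format a string pos_label; "%r" works for ints and strings alike.
    pos_label = "spam"
    labels = ["eggs", "spam"]
    try:
        "pos_label=%d is not a valid label: %r" % (pos_label, labels)
    except TypeError as exc:
        print(exc)  # e.g. "%d format: a number is required, not str"
    # The patched message formats cleanly for any label type:
    print("pos_label=%r is not a valid label: %r" % (pos_label, list(labels)))
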
--- sklearn/metrics/metrics.py | 4 ++-- sklearn/metrics/tests/test_metrics.py | 18 ++++-------------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 67e23db45d7c4..16a37b9425f83 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1673,8 +1673,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, if len(labels) == 1: # Only negative labels return (0., 0., 0., 0) - raise ValueError("pos_label=%d is not a valid label: %r" % - (pos_label, labels)) + raise ValueError("pos_label=%r is not a valid label: %r" % + (pos_label, list(labels))) pos_label_idx = list(labels).index(pos_label) return (precision[pos_label_idx], recall[pos_label_idx], fscore[pos_label_idx], support[pos_label_idx]) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index aba6e18db53fa..615d0484d8496 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -740,14 +740,8 @@ def test_classification_report_multiclass(): def test_classification_report_multiclass_with_string_label(): y_true, y_pred, _ = make_prediction(binary=False) - y_true = y_true.astype(np.str) - y_true[y_true == "0"] = "blue" - y_true[y_true == "1"] = "green" - y_true[y_true == "2"] = "red" - y_pred = y_pred.astype(np.str) - y_pred[y_pred == "0"] = "blue" - y_pred[y_pred == "1"] = "green" - y_pred[y_pred == "2"] = "red" + y_true = np.array(["blue", "green", "red"])[y_true] + y_pred = np.array(["blue", "green", "red"])[y_pred] expected_report = """\ precision recall f1-score support @@ -1063,12 +1057,8 @@ def test_invariance_string_vs_numbers_labels(): """Ensure that classification metrics with string labels""" y1, y2, _ = make_prediction(binary=True) - y1_str = y1.astype(np.str) - y1_str[y1_str == "0"] = "eggs" - y1_str[y1_str == "1"] = "spam" - y2_str = y2.astype(np.str) - y2_str[y2_str == "0"] = "eggs" - y2_str[y2_str == "1"] = "spam" + y1_str = np.array(["eggs", "spam"])[y1] + y2_str = np.array(["eggs", "spam"])[y2] pos_label_str = "spam" labels_str = ["eggs", "spam"] From aeeee9352546d5752051d397a754465e7cfa8a90 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 12:27:49 +0200 Subject: [PATCH 5/9] COSMIT --- sklearn/metrics/metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 16a37b9425f83..b43d10ebccae5 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -27,7 +27,6 @@ from ..utils import check_arrays from ..utils import deprecated from ..utils.fixes import divide -from ..utils.fixes import unique from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target From 3e32e862de5b405ccff57bebfd849bb4bd1513e6 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 12:29:10 +0200 Subject: [PATCH 6/9] Update what's new --- doc/whats_new.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 85fa6bfe666f7..f9a1e51b7eb63 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -7,6 +7,7 @@ Changelog --------- + - Missing values with sparse and dense matrices can be imputed with the transformer :class:`preprocessing.Imputer` by `Nicolas Trésegnie`_. @@ -132,6 +133,7 @@ Changelog - Python 3 support fixes by `Justin Vincent`_, `Lars Buitinck`_ and `Olivier Grisel`_. All tests now pass under Python 3.3. 
+<<<<<<< HEAD - Reduce memory footprint of FastICA by `Denis Engemann`_ and `Alexandre Gramfort`_. @@ -146,6 +148,10 @@ Changelog By `Peter Prettenhofer`_. +======= + - Most metrics now support string labels for multiclass classification + by `Arnaud Joly`_ and `Lars Buitinck`_. +>>>>>>> Update what's new API changes summary ------------------- From 9b57814240e23b70404b3f24102822a2db78ddc5 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 15:38:11 +0200 Subject: [PATCH 7/9] DOC state that string is possible --- sklearn/metrics/metrics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index b43d10ebccae5..526f8ee8ad53f 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1073,7 +1073,7 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): labels : array Integer array of labels. - pos_label : int, 1 by default + pos_label : str or int, 1 by default If ``average`` is not ``None`` and the classification target is binary, only this class's scores will be returned. @@ -1197,7 +1197,7 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, labels : array Integer array of labels. - pos_label : int, 1 by default + pos_label : str or int, 1 by default If ``average`` is not ``None`` and the classification target is binary, only this class's scores will be returned. @@ -1460,7 +1460,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, labels : array Integer array of labels. - pos_label : int, 1 by default + pos_label : str or int, 1 by default If ``average`` is not ``None`` and the classification target is binary, only this class's scores will be returned. @@ -1743,7 +1743,7 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, labels : array Integer array of labels. - pos_label : int, 1 by default + pos_label : str or int, 1 by default If ``average`` is not ``None`` and the classification target is binary, only this class's scores will be returned. @@ -1865,7 +1865,7 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): labels : array Integer array of labels. - pos_label : int, 1 by default + pos_label : str or int, 1 by default If ``average`` is not ``None`` and the classification target is binary, only this class's scores will be returned. 
From 7462bf3614562a8bedaea0796e1df476756940ac Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 15:43:39 +0200 Subject: [PATCH 8/9] TST with labels arguments --- sklearn/metrics/tests/test_metrics.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 615d0484d8496..71a0af183d46b 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -128,6 +128,21 @@ "macro_precision_score", "macro_recall_score", ] +METRICS_WITH_LABELS = [ + "confusion_matrix", + + "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", + + "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", + "weighted_precision_score", "weighted_recall_score", + + "micro_f0.5_score", "micro_f1_score", "micro_f2_score", + "micro_precision_score", "micro_recall_score", + + "macro_f0.5_score", "macro_f1_score", "macro_f2_score", + "macro_precision_score", "macro_recall_score", +] + METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score ": accuracy_score, "jaccard_similarity_score": jaccard_similarity_score, @@ -1077,6 +1092,13 @@ def test_invariance_string_vs_numbers_labels(): err_msg="{0} failed string vs number invariance " "test".format(name)) + if name in METRICS_WITH_LABELS: + metric_str = partial(metric_str, labels=labels_str) + measure_with_str = metric_str(y1_str, y2_str) + assert_array_equal(measure_with_number, measure_with_str, + err_msg="{0} failed string vs number " + "invariance test".format(name)) + # TODO Currently not supported for name, metrics in THRESHOLDED_METRICS.items(): assert_raises(ValueError, metrics, y1_str, y2_str) From 98c0d73bfdaeb3c97f02fcf1be10a1e620f85e58 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 16:27:28 +0200 Subject: [PATCH 9/9] FIX what's new... --- doc/whats_new.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index f9a1e51b7eb63..3b9b9e9153ec5 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -133,7 +133,6 @@ Changelog - Python 3 support fixes by `Justin Vincent`_, `Lars Buitinck`_ and `Olivier Grisel`_. All tests now pass under Python 3.3. -<<<<<<< HEAD - Reduce memory footprint of FastICA by `Denis Engemann`_ and `Alexandre Gramfort`_. @@ -147,11 +146,9 @@ Changelog how to use OOB estimates to select the number of trees was added. By `Peter Prettenhofer`_. - -======= - Most metrics now support string labels for multiclass classification by `Arnaud Joly`_ and `Lars Buitinck`_. ->>>>>>> Update what's new + API changes summary -------------------
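
Taken together, the series lets the classification metrics accept string class labels directly, with the same results as the equivalent integer encoding. A short end-to-end sketch of the invariance the new tests assert, with data and label names invented for illustration:

    import numpy as np
    from sklearn.metrics import (accuracy_score, confusion_matrix,
                                 matthews_corrcoef)

    y_true = np.array([0, 1, 1, 0, 1, 1])
    y_pred = np.array([0, 1, 0, 0, 1, 1])

    # Re-encode the integer labels as strings, as the invariance test does.
    y_true_str = np.array(["eggs", "spam"])[y_true]
    y_pred_str = np.array(["eggs", "spam"])[y_pred]

    # The scores agree between the integer and string encodings.
    assert accuracy_score(y_true, y_pred) == accuracy_score(y_true_str, y_pred_str)
    assert matthews_corrcoef(y_true, y_pred) == matthews_corrcoef(y_true_str,
                                                                  y_pred_str)
    assert np.array_equal(confusion_matrix(y_true, y_pred),
                          confusion_matrix(y_true_str, y_pred_str))

The thresholded metrics (`auc_score`, `average_precision_score`) are the deliberate exception: they still reject string targets, as the `assert_raises(ValueError, ...)` loop in the new test checks.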