From 8bbd0c0114b387b08026065bd8e2cbf8efa63b66 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Fri, 19 Jul 2013 08:23:01 +0200 Subject: [PATCH 1/9] ENH add support for string input with classification metrics --- sklearn/metrics/metrics.py | 21 +++-- sklearn/metrics/tests/test_metrics.py | 120 +++++++++++++++++++++----- 2 files changed, 114 insertions(+), 27 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index f564ff72338af..967ec7294166b 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -27,6 +27,7 @@ from ..utils import check_arrays from ..utils import deprecated from ..utils.fixes import divide +from ..utils.fixes import unique from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target @@ -425,7 +426,13 @@ def matthews_corrcoef(y_true, y_pred): if y_type != "binary": raise ValueError("%s is not supported" % y_type) - mcc = np.corrcoef(y_true, y_pred)[0, 1] + tp, tn, fp, fn = _tp_tn_fp_fn(y_true, y_pred) + tp, tn, fp, fn = tp[1], tn[1], fp[1], fn[1] + + num = (tp * tn - fp * fn) + den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + mcc = num / den + if np.isnan(mcc): return 0. else: @@ -499,7 +506,7 @@ def _binary_clf_curve(y_true, y_score, pos_label=None): return fps, tps, y_score[threshold_idxs] -def precision_recall_curve(y_true, probas_pred): +def precision_recall_curve(y_true, probas_pred, pos_label=None): """Compute precision-recall pairs for different probability thresholds Note: this implementation is restricted to the binary classification task. @@ -705,6 +712,7 @@ def confusion_matrix(y_true, y_pred, labels=None): if y_type not in ("binary", "multiclass"): raise ValueError("%s is not supported" % y_type) + if labels is None: labels = unique_labels(y_true, y_pred) else: @@ -1373,6 +1381,7 @@ def _tp_tn_fp_fn(y_true, y_pred, labels=None): labels = unique_labels(y_true, y_pred) else: labels = np.asarray(labels) + n_labels = labels.size true_pos = np.zeros((n_labels, ), dtype=np.int) false_pos = np.zeros((n_labels, ), dtype=np.int) @@ -2021,13 +2030,13 @@ class 2 1.00 1.00 1.00 2 if labels is None: labels = unique_labels(y_true, y_pred) else: - labels = np.asarray(labels, dtype=np.int) + labels = np.asarray(labels) last_line_heading = 'avg / total' if target_names is None: width = len(last_line_heading) - target_names = ['%d' % l for l in labels] + target_names = ['{0}'.format(l) for l in labels] else: width = max(len(cn) for cn in target_names) width = max(width, len(last_line_heading)) @@ -2049,8 +2058,8 @@ class 2 1.00 1.00 1.00 2 for i, label in enumerate(labels): values = [target_names[i]] for v in (p[i], r[i], f1[i]): - values += ["%0.2f" % float(v)] - values += ["%d" % int(s[i])] + values += ["{0:0.2f}".format(float(v))] + values += ["{0}".format(s[i])] report += fmt % tuple(values) report += '\n' diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index a4d4327563360..f719bee23f54c 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1,6 +1,7 @@ from __future__ import division, print_function import warnings +import inspect import numpy as np from functools import partial @@ -55,7 +56,14 @@ from sklearn.externals.six.moves import xrange -ALL_METRICS = { +REGRESSION_METRICS = { + "mean_absolute_error": mean_absolute_error, + "mean_squared_error": mean_squared_error, + "explained_variance_score": explained_variance_score, + "r2_score": r2_score, +} + +CLASSIFICATION_METRICS = { 
"accuracy_score": accuracy_score, "unormalized_accuracy_score": partial(accuracy_score, normalize=False), "confusion_matrix": confusion_matrix, @@ -74,8 +82,6 @@ "f2_score": partial(fbeta_score, beta=2), "f0.5_score": partial(fbeta_score, beta=0.5), "matthews_corrcoef_score": matthews_corrcoef, - "auc_score": auc_score, - "average_precision_score": average_precision_score, "weighted_f0.5_score": partial(fbeta_score, average="weighted", beta=0.5), "weighted_f1_score": partial(f1_score, average="weighted"), @@ -95,13 +101,19 @@ "macro_precision_score": partial(precision_score, average="macro"), "macro_recall_score": partial(recall_score, average="macro"), - "mean_absolute_error": mean_absolute_error, - "mean_squared_error": mean_squared_error, - "explained_variance_score": explained_variance_score, - "r2_score": r2_score, - "confusion_matrix": partial(confusion_matrix, labels=range(3)), + "confusion_matrix": partial(confusion_matrix), +} + +THRESHOLDED_METRICS = { + "auc_score": auc_score, + "average_precision_score": average_precision_score, } +ALL_METRICS = dict() +ALL_METRICS.update(THRESHOLDED_METRICS) +ALL_METRICS.update(CLASSIFICATION_METRICS) +ALL_METRICS.update(REGRESSION_METRICS) + METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score ": accuracy_score, "jaccard_similarity_score": jaccard_similarity_score, @@ -211,11 +223,6 @@ "confusion_matrix": partial(confusion_matrix, labels=range(3)), } -THRESHOLDED_METRICS = { - "auc_score": auc_score, - "average_precision_score": average_precision_score, -} - def make_prediction(dataset=None, binary=False): """Make some classification predictions on a toy dataset using a SVC @@ -706,24 +713,53 @@ def test_classification_report_multiclass(): expected_report = """\ precision recall f1-score support - 0 0.82 0.92 0.87 25 - 1 0.56 0.17 0.26 30 - 2 0.47 0.90 0.62 20 + 0 0.83 0.79 0.81 24 + 1 0.33 0.10 0.15 31 + 2 0.42 0.90 0.57 20 -avg / total 0.62 0.61 0.56 75 +avg / total 0.51 0.53 0.47 75 """ + report = classification_report(y_true, y_pred) + assert_equal(report, expected_report) + + +def test_classification_report_multiclass_with_string_label(): + y_true, y_pred, _ = make_prediction(binary=False) + + y_true = y_true.astype(np.str) + y_true[y_true == "0"] = "blue" + y_true[y_true == "1"] = "green" + y_true[y_true == "2"] = "red" + y_pred = y_pred.astype(np.str) + y_pred[y_pred == "0"] = "blue" + y_pred[y_pred == "1"] = "green" + y_pred[y_pred == "2"] = "red" + expected_report = """\ precision recall f1-score support - 0 0.83 0.79 0.81 24 - 1 0.33 0.10 0.15 31 - 2 0.42 0.90 0.57 20 + blue 0.83 0.79 0.81 24 + green 0.33 0.10 0.15 31 + red 0.42 0.90 0.57 20 avg / total 0.51 0.53 0.47 75 """ report = classification_report(y_true, y_pred) assert_equal(report, expected_report) + expected_report = """\ + precision recall f1-score support + + a 0.83 0.79 0.81 24 + b 0.33 0.10 0.15 31 + c 0.42 0.90 0.57 20 + +avg / total 0.51 0.53 0.47 75 +""" + report = classification_report(y_true, y_pred, + target_names=["a", "b", "c"]) + assert_equal(report, expected_report) + def test_multilabel_classification_report(): @@ -891,7 +927,7 @@ def test_symmetry(): # We shouldn't forget any metrics assert_equal(set(SYMMETRIC_METRICS).union(NOT_SYMMETRIC_METRICS, - THRESHOLDED_METRICS), + THRESHOLDED_METRICS), set(ALL_METRICS)) assert_equal(set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)), @@ -1009,6 +1045,48 @@ def test_format_invariance_with_1d_vectors(): assert_raises(ValueError, metric, y1_row, y2_row) +def test_invariance_string_vs_numbers_labels(): + 
"""Ensure that classification metrics with string labels""" + y1, y2, _ = make_prediction(binary=True) + + y1_str = y1.astype(np.str) + y1_str[y1_str == "0"] = "eggs" + y1_str[y1_str == "1"] = "spam" + y2_str = y2.astype(np.str) + y2_str[y2_str == "0"] = "eggs" + y2_str[y2_str == "1"] = "spam" + + pos_label_str = "spam" + labels_str = ["eggs", "spam"] + + for name, metric in CLASSIFICATION_METRICS.items(): + print(name) + measure_with_number = metric(y1, y2) + + # Ugly, but handle case with a pos_label + if hasattr(metric, "func"): + argspect = inspect.getargspec(metric.func) + else: + argspect = inspect.getargspec(metric) + + metric_str = metric + if "pos_label" in argspect[0]: + metric_str = partial(metric_str, pos_label=pos_label_str) + + if "labels" in argspect[0]: + metric_str = partial(metric_str, labels=labels_str) + + measure_with_str = metric_str(y1_str, y2_str) + + assert_array_equal(measure_with_number, measure_with_str, + err_msg="{0} failed string vs number invariance " + "test".format(name)) + + # Currently not supported + for name, metrics in THRESHOLDED_METRICS.items(): + assert_raises(ValueError, metrics, y1_str, y2_str) + + def test_clf_single_sample(): """Non-regression test: scores should work with a single sample. From 8fdc542ea7aadc8988de03547727459e209ae39b Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Fri, 19 Jul 2013 09:52:51 +0200 Subject: [PATCH 2/9] ENH use the new format syntax --- sklearn/metrics/metrics.py | 6 +++--- sklearn/metrics/tests/test_metrics.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 967ec7294166b..67e23db45d7c4 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -2058,7 +2058,7 @@ class 2 1.00 1.00 1.00 2 for i, label in enumerate(labels): values = [target_names[i]] for v in (p[i], r[i], f1[i]): - values += ["{0:0.2f}".format(float(v))] + values += ["{0:0.2f}".format(v)] values += ["{0}".format(s[i])] report += fmt % tuple(values) @@ -2069,8 +2069,8 @@ class 2 1.00 1.00 1.00 2 for v in (np.average(p, weights=s), np.average(r, weights=s), np.average(f1, weights=s)): - values += ["%0.2f" % float(v)] - values += ['%d' % np.sum(s)] + values += ["{0:0.2f}".format(v)] + values += ['{0}'.format(np.sum(s))] report += fmt % tuple(values) return report diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index f719bee23f54c..d730d8d2d5ada 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1060,10 +1060,9 @@ def test_invariance_string_vs_numbers_labels(): labels_str = ["eggs", "spam"] for name, metric in CLASSIFICATION_METRICS.items(): - print(name) measure_with_number = metric(y1, y2) - # Ugly, but handle case with a pos_label + # Ugly, but handle case with a pos_label and label if hasattr(metric, "func"): argspect = inspect.getargspec(metric.func) else: @@ -1082,7 +1081,7 @@ def test_invariance_string_vs_numbers_labels(): err_msg="{0} failed string vs number invariance " "test".format(name)) - # Currently not supported + # TODO Currently not supported for name, metrics in THRESHOLDED_METRICS.items(): assert_raises(ValueError, metrics, y1_str, y2_str) From cfd013997e3d5cb492d0462021299b4ff585c8fd Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Fri, 19 Jul 2013 13:27:07 +0200 Subject: [PATCH 3/9] ENH remove inspect --- sklearn/metrics/tests/test_metrics.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git 
a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index d730d8d2d5ada..aba6e18db53fa 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1,7 +1,6 @@ from __future__ import division, print_function import warnings -import inspect import numpy as np from functools import partial @@ -114,6 +113,21 @@ ALL_METRICS.update(CLASSIFICATION_METRICS) ALL_METRICS.update(REGRESSION_METRICS) +METRICS_WITH_POS_LABEL = [ + "roc_curve", + + "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", + + "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", + "weighted_precision_score", "weighted_recall_score", + + "micro_f0.5_score", "micro_f1_score", "micro_f2_score", + "micro_precision_score", "micro_recall_score", + + "macro_f0.5_score", "macro_f1_score", "macro_f2_score", + "macro_precision_score", "macro_recall_score", +] + METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score ": accuracy_score, "jaccard_similarity_score": jaccard_similarity_score, @@ -1063,18 +1077,10 @@ def test_invariance_string_vs_numbers_labels(): measure_with_number = metric(y1, y2) # Ugly, but handle case with a pos_label and label - if hasattr(metric, "func"): - argspect = inspect.getargspec(metric.func) - else: - argspect = inspect.getargspec(metric) - metric_str = metric - if "pos_label" in argspect[0]: + if name in METRICS_WITH_POS_LABEL: metric_str = partial(metric_str, pos_label=pos_label_str) - if "labels" in argspect[0]: - metric_str = partial(metric_str, labels=labels_str) - measure_with_str = metric_str(y1_str, y2_str) assert_array_equal(measure_with_number, measure_with_str, From 038a1fa692b52d317937dda15d8357ebf18c5013 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Mon, 22 Jul 2013 14:45:07 +0200 Subject: [PATCH 4/9] TST fix string labels in metrics tests Also found a faulty format op in the error handling. 
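The faulty format op is the `%d` conversion in the `ValueError` raised by `precision_recall_fscore_support` when `pos_label` is not among the observed labels: once `pos_label` may be a string, `%d` raises a `TypeError` of its own and masks the intended message. A minimal sketch of the failure mode, with illustrative values only:

    # "%d" cannot format a string pos_label; "%r" works for ints and strings alike.
    pos_label = "spam"
    labels = ["eggs", "spam"]
    try:
        "pos_label=%d is not a valid label: %r" % (pos_label, labels)
    except TypeError as exc:
        print(exc)  # e.g. "%d format: a number is required, not str"
    # The patched message formats cleanly for any label type:
    print("pos_label=%r is not a valid label: %r" % (pos_label, list(labels)))
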
--- sklearn/metrics/metrics.py | 4 ++-- sklearn/metrics/tests/test_metrics.py | 18 ++++-------------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 67e23db45d7c4..16a37b9425f83 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1673,8 +1673,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, if len(labels) == 1: # Only negative labels return (0., 0., 0., 0) - raise ValueError("pos_label=%d is not a valid label: %r" % - (pos_label, labels)) + raise ValueError("pos_label=%r is not a valid label: %r" % + (pos_label, list(labels))) pos_label_idx = list(labels).index(pos_label) return (precision[pos_label_idx], recall[pos_label_idx], fscore[pos_label_idx], support[pos_label_idx]) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index aba6e18db53fa..615d0484d8496 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -740,14 +740,8 @@ def test_classification_report_multiclass(): def test_classification_report_multiclass_with_string_label(): y_true, y_pred, _ = make_prediction(binary=False) - y_true = y_true.astype(np.str) - y_true[y_true == "0"] = "blue" - y_true[y_true == "1"] = "green" - y_true[y_true == "2"] = "red" - y_pred = y_pred.astype(np.str) - y_pred[y_pred == "0"] = "blue" - y_pred[y_pred == "1"] = "green" - y_pred[y_pred == "2"] = "red" + y_true = np.array(["blue", "green", "red"])[y_true] + y_pred = np.array(["blue", "green", "red"])[y_pred] expected_report = """\ precision recall f1-score support @@ -1063,12 +1057,8 @@ def test_invariance_string_vs_numbers_labels(): """Ensure that classification metrics with string labels""" y1, y2, _ = make_prediction(binary=True) - y1_str = y1.astype(np.str) - y1_str[y1_str == "0"] = "eggs" - y1_str[y1_str == "1"] = "spam" - y2_str = y2.astype(np.str) - y2_str[y2_str == "0"] = "eggs" - y2_str[y2_str == "1"] = "spam" + y1_str = np.array(["eggs", "spam"])[y1] + y2_str = np.array(["eggs", "spam"])[y2] pos_label_str = "spam" labels_str = ["eggs", "spam"] From aeeee9352546d5752051d397a754465e7cfa8a90 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 12:27:49 +0200 Subject: [PATCH 5/9] COSMIT --- sklearn/metrics/metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 16a37b9425f83..b43d10ebccae5 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -27,7 +27,6 @@ from ..utils import check_arrays from ..utils import deprecated from ..utils.fixes import divide -from ..utils.fixes import unique from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target From 3e32e862de5b405ccff57bebfd849bb4bd1513e6 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 12:29:10 +0200 Subject: [PATCH 6/9] Update what's new --- doc/whats_new.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 85fa6bfe666f7..f9a1e51b7eb63 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -7,6 +7,7 @@ Changelog --------- + - Missing values with sparse and dense matrices can be imputed with the transformer :class:`preprocessing.Imputer` by `Nicolas Trésegnie`_. @@ -132,6 +133,7 @@ Changelog - Python 3 support fixes by `Justin Vincent`_, `Lars Buitinck`_ and `Olivier Grisel`_. All tests now pass under Python 3.3. 
+<<<<<<< HEAD - Reduce memory footprint of FastICA by `Denis Engemann`_ and `Alexandre Gramfort`_. @@ -146,6 +148,10 @@ Changelog By `Peter Prettenhofer`_. +======= + - Most metrics now support string labels for multiclass classification + by `Arnaud Joly`_ and `Lars Buitinck`_. +>>>>>>> Update what's new API changes summary ------------------- From 9b57814240e23b70404b3f24102822a2db78ddc5 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 15:38:11 +0200 Subject: [PATCH 7/9] DOC state that string is possible --- sklearn/metrics/metrics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index b43d10ebccae5..526f8ee8ad53f 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1073,7 +1073,7 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): labels : array Integer array of labels. - pos_label : int, 1 by default + pos_label : str or int, 1 by default If ``average`` is not ``None`` and the classification target is binary, only this class's scores will be returned. @@ -1197,7 +1197,7 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, labels : array Integer array of labels. - pos_label : int, 1 by default + pos_label : str or int, 1 by default If ``average`` is not ``None`` and the classification target is binary, only this class's scores will be returned. @@ -1460,7 +1460,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, labels : array Integer array of labels. - pos_label : int, 1 by default + pos_label : str or int, 1 by default If ``average`` is not ``None`` and the classification target is binary, only this class's scores will be returned. @@ -1743,7 +1743,7 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, labels : array Integer array of labels. - pos_label : int, 1 by default + pos_label : str or int, 1 by default If ``average`` is not ``None`` and the classification target is binary, only this class's scores will be returned. @@ -1865,7 +1865,7 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): labels : array Integer array of labels. - pos_label : int, 1 by default + pos_label : str or int, 1 by default If ``average`` is not ``None`` and the classification target is binary, only this class's scores will be returned. 
From 7462bf3614562a8bedaea0796e1df476756940ac Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 15:43:39 +0200 Subject: [PATCH 8/9] TST with labels arguments --- sklearn/metrics/tests/test_metrics.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 615d0484d8496..71a0af183d46b 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -128,6 +128,21 @@ "macro_precision_score", "macro_recall_score", ] +METRICS_WITH_LABELS = [ + "confusion_matrix", + + "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", + + "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", + "weighted_precision_score", "weighted_recall_score", + + "micro_f0.5_score", "micro_f1_score", "micro_f2_score", + "micro_precision_score", "micro_recall_score", + + "macro_f0.5_score", "macro_f1_score", "macro_f2_score", + "macro_precision_score", "macro_recall_score", +] + METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score ": accuracy_score, "jaccard_similarity_score": jaccard_similarity_score, @@ -1077,6 +1092,13 @@ def test_invariance_string_vs_numbers_labels(): err_msg="{0} failed string vs number invariance " "test".format(name)) + if name in METRICS_WITH_LABELS: + metric_str = partial(metric_str, labels=labels_str) + measure_with_str = metric_str(y1_str, y2_str) + assert_array_equal(measure_with_number, measure_with_str, + err_msg="{0} failed string vs number " + "invariance test".format(name)) + # TODO Currently not supported for name, metrics in THRESHOLDED_METRICS.items(): assert_raises(ValueError, metrics, y1_str, y2_str) From 98c0d73bfdaeb3c97f02fcf1be10a1e620f85e58 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 16:27:28 +0200 Subject: [PATCH 9/9] FIX what's new... --- doc/whats_new.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index f9a1e51b7eb63..3b9b9e9153ec5 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -133,7 +133,6 @@ Changelog - Python 3 support fixes by `Justin Vincent`_, `Lars Buitinck`_ and `Olivier Grisel`_. All tests now pass under Python 3.3. -<<<<<<< HEAD - Reduce memory footprint of FastICA by `Denis Engemann`_ and `Alexandre Gramfort`_. @@ -147,11 +146,9 @@ Changelog how to use OOB estimates to select the number of trees was added. By `Peter Prettenhofer`_. - -======= - Most metrics now support string labels for multiclass classification by `Arnaud Joly`_ and `Lars Buitinck`_. ->>>>>>> Update what's new + API changes summary -------------------
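
Taken together, the series lets the classification metrics accept string class labels directly, with the same results as the equivalent integer encoding. A short end-to-end sketch of the invariance the new tests assert, with data and label names invented for illustration:

    import numpy as np
    from sklearn.metrics import (accuracy_score, confusion_matrix,
                                 matthews_corrcoef)

    y_true = np.array([0, 1, 1, 0, 1, 1])
    y_pred = np.array([0, 1, 0, 0, 1, 1])

    # Re-encode the integer labels as strings, as the invariance test does.
    y_true_str = np.array(["eggs", "spam"])[y_true]
    y_pred_str = np.array(["eggs", "spam"])[y_pred]

    # The scores agree between the integer and string encodings.
    assert accuracy_score(y_true, y_pred) == accuracy_score(y_true_str, y_pred_str)
    assert matthews_corrcoef(y_true, y_pred) == matthews_corrcoef(y_true_str,
                                                                  y_pred_str)
    assert np.array_equal(confusion_matrix(y_true, y_pred),
                          confusion_matrix(y_true_str, y_pred_str))

The thresholded metrics (`auc_score`, `average_precision_score`) are the deliberate exception: they still reject string targets, as the `assert_raises(ValueError, ...)` loop in the new test checks.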