
[MRG] Ensure that classification metrics support string label #2170


Closed · wants to merge 9 commits
3 changes: 3 additions & 0 deletions doc/whats_new.rst
@@ -7,6 +7,7 @@

Changelog
---------

- Missing values with sparse and dense matrices can be imputed with the
transformer :class:`preprocessing.Imputer` by `Nicolas Trésegnie`_.

@@ -145,6 +146,8 @@ Changelog
how to use OOB estimates to select the number of trees was added.
By `Peter Prettenhofer`_.

- Most metrics now support string labels for multiclass classification
by `Arnaud Joly`_ and `Lars Buitinck`_.


API changes summary
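For context on the changelog entry above, a hedged sketch of the usage it enables; the labels and values below are made up for illustration and are not taken from the pull request:

```python
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix

# Hypothetical string-labelled targets and predictions.
y_true = np.array(["spam", "eggs", "spam", "spam", "eggs"])
y_pred = np.array(["spam", "spam", "spam", "eggs", "eggs"])

# pos_label and labels may now be strings rather than integers.
print(f1_score(y_true, y_pred, pos_label="spam"))  # score for the "spam" class
print(confusion_matrix(y_true, y_pred, labels=["eggs", "spam"]))
```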
38 changes: 23 additions & 15 deletions sklearn/metrics/metrics.py
Expand Up @@ -425,7 +425,13 @@ def matthews_corrcoef(y_true, y_pred):
if y_type != "binary":
raise ValueError("%s is not supported" % y_type)

mcc = np.corrcoef(y_true, y_pred)[0, 1]
tp, tn, fp, fn = _tp_tn_fp_fn(y_true, y_pred)
tp, tn, fp, fn = tp[1], tn[1], fp[1], fn[1]

num = (tp * tn - fp * fn)
den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = num / den

if np.isnan(mcc):
return 0.
else:
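For context (not part of the diff): the replacement computes MCC from the confusion counts instead of calling `np.corrcoef` on the raw label arrays, which is what allows non-numeric labels. A minimal standalone sketch of that computation, using made-up binary string labels:

```python
import numpy as np

y_true = np.array(["spam", "eggs", "spam", "spam", "eggs"])
y_pred = np.array(["spam", "spam", "spam", "eggs", "eggs"])

# Confusion counts for the (arbitrarily chosen) positive class "spam".
pos = "spam"
tp = np.sum((y_true == pos) & (y_pred == pos))
tn = np.sum((y_true != pos) & (y_pred != pos))
fp = np.sum((y_true != pos) & (y_pred == pos))
fn = np.sum((y_true == pos) & (y_pred != pos))

# MCC = (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)),
# taken as 0 when the denominator vanishes.
num = tp * tn - fp * fn
den = np.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
mcc = num / den if den else 0.0
print(mcc)  # ~0.167 for these toy arrays
```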
@@ -499,7 +505,7 @@ def _binary_clf_curve(y_true, y_score, pos_label=None):
return fps, tps, y_score[threshold_idxs]


def precision_recall_curve(y_true, probas_pred):
def precision_recall_curve(y_true, probas_pred, pos_label=None):
"""Compute precision-recall pairs for different probability thresholds

Note: this implementation is restricted to the binary classification task.
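A hedged usage sketch for the new `pos_label` argument (input values assumed for illustration): with string targets the positive class cannot be inferred from the label values, so it has to be named explicitly.

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

y_true = np.array(["eggs", "eggs", "spam", "spam"])
scores = np.array([0.1, 0.4, 0.35, 0.8])  # decision scores for the "spam" class

precision, recall, thresholds = precision_recall_curve(y_true, scores,
                                                       pos_label="spam")
```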
@@ -705,6 +711,7 @@ def confusion_matrix(y_true, y_pred, labels=None):
if y_type not in ("binary", "multiclass"):
raise ValueError("%s is not supported" % y_type)


if labels is None:
labels = unique_labels(y_true, y_pred)
else:
@@ -1066,7 +1073,7 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'):
labels : array
Integer array of labels.

pos_label : int, 1 by default
pos_label : str or int, 1 by default
If ``average`` is not ``None`` and the classification target is binary,
only this class's scores will be returned.

@@ -1190,7 +1197,7 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
labels : array
Integer array of labels.

pos_label : int, 1 by default
pos_label : str or int, 1 by default
If ``average`` is not ``None`` and the classification target is binary,
only this class's scores will be returned.

@@ -1373,6 +1380,7 @@ def _tp_tn_fp_fn(y_true, y_pred, labels=None):
labels = unique_labels(y_true, y_pred)
else:
labels = np.asarray(labels)

n_labels = labels.size
true_pos = np.zeros((n_labels, ), dtype=np.int)
false_pos = np.zeros((n_labels, ), dtype=np.int)
@@ -1452,7 +1460,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
labels : array
Integer array of labels.

pos_label : int, 1 by default
pos_label : str or int, 1 by default
If ``average`` is not ``None`` and the classification target is binary,
only this class's scores will be returned.

@@ -1664,8 +1672,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
if len(labels) == 1:
# Only negative labels
return (0., 0., 0., 0)
raise ValueError("pos_label=%d is not a valid label: %r" %
(pos_label, labels))
raise ValueError("pos_label=%r is not a valid label: %r" %
(pos_label, list(labels)))
pos_label_idx = list(labels).index(pos_label)
return (precision[pos_label_idx], recall[pos_label_idx],
fscore[pos_label_idx], support[pos_label_idx])
@@ -1735,7 +1743,7 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
labels : array
Integer array of labels.

pos_label : int, 1 by default
pos_label : str or int, 1 by default
If ``average`` is not ``None`` and the classification target is binary,
only this class's scores will be returned.

@@ -1857,7 +1865,7 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'):
labels : array
Integer array of labels.

pos_label : int, 1 by default
pos_label : str or int, 1 by default
If ``average`` is not ``None`` and the classification target is binary,
only this class's scores will be returned.

@@ -2021,13 +2029,13 @@ class 2 1.00 1.00 1.00 2
if labels is None:
labels = unique_labels(y_true, y_pred)
else:
labels = np.asarray(labels, dtype=np.int)
labels = np.asarray(labels)

last_line_heading = 'avg / total'

if target_names is None:
width = len(last_line_heading)
target_names = ['%d' % l for l in labels]
target_names = ['{0}'.format(l) for l in labels]
else:
width = max(len(cn) for cn in target_names)
width = max(width, len(last_line_heading))
@@ -2049,8 +2057,8 @@ class 2 1.00 1.00 1.00 2
for i, label in enumerate(labels):
values = [target_names[i]]
for v in (p[i], r[i], f1[i]):
values += ["%0.2f" % float(v)]
values += ["%d" % int(s[i])]
values += ["{0:0.2f}".format(v)]
values += ["{0}".format(s[i])]
report += fmt % tuple(values)

report += '\n'
@@ -2060,8 +2068,8 @@ class 2 1.00 1.00 1.00 2
for v in (np.average(p, weights=s),
np.average(r, weights=s),
np.average(f1, weights=s)):
values += ["%0.2f" % float(v)]
values += ['%d' % np.sum(s)]
values += ["{0:0.2f}".format(v)]
values += ['{0}'.format(np.sum(s))]
report += fmt % tuple(values)
return report

137 changes: 116 additions & 21 deletions sklearn/metrics/tests/test_metrics.py
@@ -55,7 +55,14 @@

from sklearn.externals.six.moves import xrange

ALL_METRICS = {
REGRESSION_METRICS = {
"mean_absolute_error": mean_absolute_error,
"mean_squared_error": mean_squared_error,
"explained_variance_score": explained_variance_score,
"r2_score": r2_score,
}

CLASSIFICATION_METRICS = {
"accuracy_score": accuracy_score,
"unormalized_accuracy_score": partial(accuracy_score, normalize=False),
"confusion_matrix": confusion_matrix,
@@ -74,8 +81,6 @@
"f2_score": partial(fbeta_score, beta=2),
"f0.5_score": partial(fbeta_score, beta=0.5),
"matthews_corrcoef_score": matthews_corrcoef,
"auc_score": auc_score,
"average_precision_score": average_precision_score,

"weighted_f0.5_score": partial(fbeta_score, average="weighted", beta=0.5),
"weighted_f1_score": partial(f1_score, average="weighted"),
@@ -95,13 +100,49 @@
"macro_precision_score": partial(precision_score, average="macro"),
"macro_recall_score": partial(recall_score, average="macro"),

"mean_absolute_error": mean_absolute_error,
"mean_squared_error": mean_squared_error,
"explained_variance_score": explained_variance_score,
"r2_score": r2_score,
"confusion_matrix": partial(confusion_matrix, labels=range(3)),
"confusion_matrix": partial(confusion_matrix),
}

THRESHOLDED_METRICS = {
"auc_score": auc_score,
"average_precision_score": average_precision_score,
}

ALL_METRICS = dict()
ALL_METRICS.update(THRESHOLDED_METRICS)
ALL_METRICS.update(CLASSIFICATION_METRICS)
ALL_METRICS.update(REGRESSION_METRICS)

METRICS_WITH_POS_LABEL = [
"roc_curve",

"precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",

"weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
"weighted_precision_score", "weighted_recall_score",

"micro_f0.5_score", "micro_f1_score", "micro_f2_score",
"micro_precision_score", "micro_recall_score",

"macro_f0.5_score", "macro_f1_score", "macro_f2_score",
"macro_precision_score", "macro_recall_score",
]

METRICS_WITH_LABELS = [
"confusion_matrix",

"precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",

"weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
"weighted_precision_score", "weighted_recall_score",

"micro_f0.5_score", "micro_f1_score", "micro_f2_score",
"micro_precision_score", "micro_recall_score",

"macro_f0.5_score", "macro_f1_score", "macro_f2_score",
"macro_precision_score", "macro_recall_score",
]

METRICS_WITH_NORMALIZE_OPTION = {
"accuracy_score ": accuracy_score,
"jaccard_similarity_score": jaccard_similarity_score,
@@ -211,11 +252,6 @@
"confusion_matrix": partial(confusion_matrix, labels=range(3)),
}

THRESHOLDED_METRICS = {
"auc_score": auc_score,
"average_precision_score": average_precision_score,
}


def make_prediction(dataset=None, binary=False):
"""Make some classification predictions on a toy dataset using a SVC
@@ -706,24 +742,47 @@ def test_classification_report_multiclass():
expected_report = """\
precision recall f1-score support

0 0.82 0.92 0.87 25
1 0.56 0.17 0.26 30
2 0.47 0.90 0.62 20
0 0.83 0.79 0.81 24
1 0.33 0.10 0.15 31
2 0.42 0.90 0.57 20

avg / total 0.62 0.61 0.56 75
avg / total 0.51 0.53 0.47 75
"""
report = classification_report(y_true, y_pred)
assert_equal(report, expected_report)


def test_classification_report_multiclass_with_string_label():
y_true, y_pred, _ = make_prediction(binary=False)

y_true = np.array(["blue", "green", "red"])[y_true]
y_pred = np.array(["blue", "green", "red"])[y_pred]

expected_report = """\
precision recall f1-score support

blue 0.83 0.79 0.81 24
green 0.33 0.10 0.15 31
red 0.42 0.90 0.57 20

avg / total 0.51 0.53 0.47 75
"""
report = classification_report(y_true, y_pred)
assert_equal(report, expected_report)

expected_report = """\
precision recall f1-score support

a 0.83 0.79 0.81 24
b 0.33 0.10 0.15 31
c 0.42 0.90 0.57 20

avg / total 0.51 0.53 0.47 75
"""
report = classification_report(y_true, y_pred,
target_names=["a", "b", "c"])
assert_equal(report, expected_report)


def test_multilabel_classification_report():

@@ -891,7 +950,7 @@ def test_symmetry():

# We shouldn't forget any metrics
assert_equal(set(SYMMETRIC_METRICS).union(NOT_SYMMETRIC_METRICS,
THRESHOLDED_METRICS),
THRESHOLDED_METRICS),
set(ALL_METRICS))

assert_equal(set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)),
@@ -1009,6 +1068,42 @@ def test_format_invariance_with_1d_vectors():
assert_raises(ValueError, metric, y1_row, y2_row)


def test_invariance_string_vs_numbers_labels():
"""Ensure that classification metrics with string labels"""
y1, y2, _ = make_prediction(binary=True)

y1_str = np.array(["eggs", "spam"])[y1]
y2_str = np.array(["eggs", "spam"])[y2]

pos_label_str = "spam"
labels_str = ["eggs", "spam"]

for name, metric in CLASSIFICATION_METRICS.items():
measure_with_number = metric(y1, y2)

# Ugly, but handles the metrics that also take pos_label or labels arguments
metric_str = metric
if name in METRICS_WITH_POS_LABEL:
metric_str = partial(metric_str, pos_label=pos_label_str)

measure_with_str = metric_str(y1_str, y2_str)

assert_array_equal(measure_with_number, measure_with_str,
err_msg="{0} failed string vs number invariance "
"test".format(name))

if name in METRICS_WITH_LABELS:
metric_str = partial(metric_str, labels=labels_str)
measure_with_str = metric_str(y1_str, y2_str)
assert_array_equal(measure_with_number, measure_with_str,
err_msg="{0} failed string vs number "
"invariance test".format(name))

# TODO Currently not supported
for name, metrics in THRESHOLDED_METRICS.items():
assert_raises(ValueError, metrics, y1_str, y2_str)


def test_clf_single_sample():
"""Non-regression test: scores should work with a single sample.
