From 090037f13fe9b7af6496a37efb17a44e7f459da7 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Mon, 14 Nov 2011 21:13:34 -0500 Subject: [PATCH 01/15] added avg_f1_score --- sklearn/metrics/__init__.py | 2 +- sklearn/metrics/metrics.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index c8123b2343573..bff0e64cc5123 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -7,7 +7,7 @@ recall_score, fbeta_score, f1_score, zero_one_score, \ precision_recall_fscore_support, classification_report, \ precision_recall_curve, explained_variance_score, r2_score, \ - zero_one, mean_square_error, hinge_loss + zero_one, mean_square_error, hinge_loss, avg_f1_score from . import cluster from .cluster import adjusted_rand_score diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index e6f7c7ad12262..e69017f4c0b05 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -458,6 +458,11 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None): return precision, recall, fscore, support +def avg_f1_score(y_true, y_pred): + """Return the average f1 score + """ + p, r, f1, support = precision_recall_fscore_support(y_true, y_pred) + return np.average(f1, weights=support) def classification_report(y_true, y_pred, labels=None, target_names=None): """Build a text report showing the main classification metrics From e546621f38a7e29f11770df8b27382849b77ba9c Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Mon, 14 Nov 2011 21:29:24 -0500 Subject: [PATCH 02/15] tst: added tests --- sklearn/metrics/metrics.py | 20 ++++++++++++++++++-- sklearn/metrics/tests/test_metrics.py | 4 ++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index e69017f4c0b05..dce6b3f4513b2 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -361,7 +361,7 @@ def f1_score(y_true, y_pred, pos_label=1): ------- f1_score : float f1_score of the positive class in binary classification or weighted - avergage of the f1_scores of each class for the multiclass task + average of the f1_scores of each class for the multiclass task References ---------- @@ -458,12 +458,28 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None): return precision, recall, fscore, support + def avg_f1_score(y_true, y_pred): """Return the average f1 score + + Parameters + ---------- + y_true : array, shape = [n_samples] + true targets + + y_pred : array, shape = [n_samples] + estimated targets + + Returns + ------- + avg_f1_score : float + average of the f1_scores of each class for the multiclass task + """ - p, r, f1, support = precision_recall_fscore_support(y_true, y_pred) + _, _, f1, support = precision_recall_fscore_support(y_true, y_pred) return np.average(f1, weights=support) + def classification_report(y_true, y_pred, labels=None, target_names=None): """Build a text report showing the main classification metrics diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 0fe5e6eae4942..ceef914bf1034 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -15,6 +15,7 @@ from ..metrics import explained_variance_score from ..metrics import r2_score from ..metrics import f1_score +from ..metrics import avg_f1_score from ..metrics import mean_square_error from ..metrics import precision_recall_curve from ..metrics import 
precision_recall_fscore_support @@ -140,6 +141,9 @@ def test_precision_recall_f1_score_binary(): fs = f1_score(y_true, y_pred) assert_array_almost_equal(fs, 0.74, 2) + afs = avg_f1_score(y_true, y_pred) + assert_array_almost_equal(afs, 0.74, decimal=2) + def test_confusion_matrix_binary(): """Test confusion matrix - binary classification case""" From df6010bb341b811d89befd3f48113ed517574cb5 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Mon, 14 Nov 2011 22:30:28 -0500 Subject: [PATCH 03/15] enh: added matthew's correlation coefficient --- sklearn/metrics/__init__.py | 3 ++- sklearn/metrics/metrics.py | 33 +++++++++++++++++++++++++++ sklearn/metrics/tests/test_metrics.py | 15 ++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index bff0e64cc5123..ff5baef9e87f9 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -7,7 +7,8 @@ recall_score, fbeta_score, f1_score, zero_one_score, \ precision_recall_fscore_support, classification_report, \ precision_recall_curve, explained_variance_score, r2_score, \ - zero_one, mean_square_error, hinge_loss, avg_f1_score + zero_one, mean_square_error, hinge_loss, avg_f1_score, \ + matthews_corrcoef from . import cluster from .cluster import adjusted_rand_score diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index dce6b3f4513b2..14551c3b6b5d7 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -480,6 +480,39 @@ def avg_f1_score(y_true, y_pred): return np.average(f1, weights=support) +def matthews_corrcoef(y_true, y_pred): + """Returns matthew's correlation coefficient for binary classes + + Only in the binary case does this relate to information about true and false + positives and negatives. See references below. + + Parameters + ---------- + y_true : array, shape = [n_samples] + true targets + + y_pred : array, shape = [n_samples] + estimated targets + + Returns + ------- + mcc : float + matthew's correlation coefficient (+1 represents a perfect prediction, + 0 an average random prediction and -1 and inverse prediction). + + References + ---------- + http://en.wikipedia.org/wiki/Matthews_correlation_coefficient + doi: 10.1093/bioinformatics/16.5.412 + + """ + mcc = np.corrcoef(y_true, y_pred)[0,1] + if np.isnan(mcc): + return 0. 
+ else: + return mcc + + def classification_report(y_true, y_pred, labels=None, target_names=None): """Build a text report showing the main classification metrics diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index ceef914bf1034..ada5daf77e7c2 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -16,6 +16,7 @@ from ..metrics import r2_score from ..metrics import f1_score from ..metrics import avg_f1_score +from ..metrics import matthews_corrcoef from ..metrics import mean_square_error from ..metrics import precision_recall_curve from ..metrics import precision_recall_fscore_support @@ -152,6 +153,20 @@ def test_confusion_matrix_binary(): cm = confusion_matrix(y_true, y_pred) assert_array_equal(cm, [[19, 6], [7, 18]]) + tp = cm[0,0] + tn = cm[1,1] + fp = cm[0,1] + fn = cm[1,0] + num = (tp*tn-fp*fn) + den = np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)) + if den == 0.: + true_mcc = 0 + else: + true_mcc = num/den + mcc = matthews_corrcoef(y_true, y_pred) + assert_array_almost_equal(mcc, true_mcc, decimal=2) + assert_array_almost_equal(mcc, 0.48, decimal=2) + def test_precision_recall_f1_score_multiclass(): """Test Precision Recall and F1 Score for multiclass classification task""" From ee5451e5b7654404ca62ebca530ae38ae5e00fd5 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Tue, 15 Nov 2011 08:20:26 -0500 Subject: [PATCH 04/15] sty: pep8 + doc --- sklearn/metrics/metrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 14551c3b6b5d7..786ddc18071dc 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -473,7 +473,7 @@ def avg_f1_score(y_true, y_pred): Returns ------- avg_f1_score : float - average of the f1_scores of each class for the multiclass task + average of the f1_scores of all classes """ _, _, f1, support = precision_recall_fscore_support(y_true, y_pred) @@ -483,8 +483,8 @@ def avg_f1_score(y_true, y_pred): def matthews_corrcoef(y_true, y_pred): """Returns matthew's correlation coefficient for binary classes - Only in the binary case does this relate to information about true and false - positives and negatives. See references below. + Only in the binary case does this relate to information about true and + false positives and negatives. See references below. Parameters ---------- @@ -506,7 +506,7 @@ def matthews_corrcoef(y_true, y_pred): doi: 10.1093/bioinformatics/16.5.412 """ - mcc = np.corrcoef(y_true, y_pred)[0,1] + mcc = np.corrcoef(y_true, y_pred)[0, 1] if np.isnan(mcc): return 0. 
else: From 940f34ac512999189f06718ea6ee58313f67816e Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Thu, 15 Dec 2011 14:26:00 +0100 Subject: [PATCH 05/15] enh: added support for weighted metrics closes #83 removed avg_f1_score --- sklearn/metrics/__init__.py | 3 +- sklearn/metrics/metrics.py | 159 +++++++++++++++++--------- sklearn/metrics/tests/test_metrics.py | 42 +++++-- 3 files changed, 138 insertions(+), 66 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 5ecdd62980eb1..8486c0e9686bc 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -7,8 +7,7 @@ recall_score, fbeta_score, f1_score, zero_one_score, \ precision_recall_fscore_support, classification_report, \ precision_recall_curve, explained_variance_score, r2_score, \ - zero_one, mean_square_error, hinge_loss, avg_f1_score, \ - matthews_corrcoef + zero_one, mean_square_error, hinge_loss, matthews_corrcoef from . import cluster from .cluster import adjusted_rand_score diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 786ddc18071dc..c973a93447eb1 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -205,7 +205,7 @@ def auc(x, y): return area -def precision_score(y_true, y_pred, pos_label=1): +def precision_score(y_true, y_pred, labels=None, pos_label=1, average='micro'): """Compute the precision The precision is the ratio :math:`tp / (tp + fp)` where tp is the @@ -223,11 +223,18 @@ def precision_score(y_true, y_pred, pos_label=1): y_pred : array, shape = [n_samples] predicted targets + labels : array + integer array of labels + pos_label : int - in the binary classification case, give the label of the - positive class (default is 1). Everything else but 'pos_label' + in the binary classification case, give the label of the positive + class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. + + average : string, ['micro', 'macro', 'weighted'] + in the multiclass classification case, this determines the + type of averaging performed on the data. Returns ------- @@ -237,14 +244,14 @@ def precision_score(y_true, y_pred, pos_label=1): multiclass task """ - p, _, _, s = precision_recall_fscore_support(y_true, y_pred) - if p.shape[0] == 2: - return p[pos_label] - else: - return np.average(p, weights=s) + p, _, _, _ = precision_recall_fscore_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average) + return p -def recall_score(y_true, y_pred, pos_label=1): +def recall_score(y_true, y_pred, labels=None, pos_label=1, average='micro'): """Compute the recall The recall is the ratio :math:`tp / (tp + fn)` where tp is the number of @@ -261,11 +268,18 @@ def recall_score(y_true, y_pred, pos_label=1): y_pred : array, shape = [n_samples] predicted targets + labels : array + integer array of labels + pos_label : int in the binary classification case, give the label of the positive class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. + + average : string, ['micro', 'macro', 'weighted'] + in the multiclass classification case, this determines the + type of averaging performed on the data. Returns ------- @@ -274,14 +288,14 @@ class (default is 1). 
Everything else but 'pos_label' avergage of the recall of each class for the multiclass task. """ - _, r, _, s = precision_recall_fscore_support(y_true, y_pred) - if r.shape[0] == 2: - return r[pos_label] - else: - return np.average(r, weights=s) + _, r, _, _ = precision_recall_fscore_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average) + return r -def fbeta_score(y_true, y_pred, beta, pos_label=1): +def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, average='micro'): """Compute fbeta score The F_beta score is the weighted harmonic mean of precision and recall, @@ -301,11 +315,18 @@ def fbeta_score(y_true, y_pred, beta, pos_label=1): beta: float + labels : array + integer array of labels + pos_label : int in the binary classification case, give the label of the positive class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. + + average : string, ['micro', 'macro', 'weighted'] + in the multiclass classification case, this determines the + type of averaging performed on the data. Returns ------- @@ -321,14 +342,15 @@ class (default is 1). Everything else but 'pos_label' http://en.wikipedia.org/wiki/F1_score """ - _, _, f, s = precision_recall_fscore_support(y_true, y_pred, beta=beta) - if f.shape[0] == 2: - return f[pos_label] - else: - return np.average(f, weights=s) + _, _, f, _ = precision_recall_fscore_support(y_true, y_pred, + beta=beta, + labels=labels, + pos_label=pos_label, + average=average) + return f -def f1_score(y_true, y_pred, pos_label=1): +def f1_score(y_true, y_pred, labels=None, pos_label=1, average='micro'): """Compute f1 score The F1 score can be interpreted as a weighted average of the precision @@ -351,11 +373,18 @@ def f1_score(y_true, y_pred, pos_label=1): y_pred : array, shape = [n_samples] predicted targets + labels : array + integer array of labels + pos_label : int - in the binary classification case, give the label of the positive class - (default is 1). Everything else but 'pos_label' + in the binary classification case, give the label of the positive + class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. + + average : string, ['micro', 'macro', 'weighted'] + in the multiclass classification case, this determines the + type of averaging performed on the data. Returns ------- @@ -368,10 +397,12 @@ def f1_score(y_true, y_pred, pos_label=1): http://en.wikipedia.org/wiki/F1_score """ - return fbeta_score(y_true, y_pred, 1, pos_label=pos_label) + return fbeta_score(y_true, y_pred, 1, labels=labels, + pos_label=pos_label, average=average) -def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None): +def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, + pos_label=None, average=None): """Compute precisions, recalls, f-measures and support for each class The precision is the ratio :math:`tp / (tp + fp)` where tp is the number of @@ -392,6 +423,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None): The support is the number of occurrences of each class in y_true. + If pos_label is None, this function returns the average precision, recall + and f-measure. The averaging is either 'micro', 'macro', 'weighted'. 
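
(Aside: a minimal sketch of what the three averaging modes boil down to, computed from the per-class values that precision_recall_fscore_support returns with average=None. The toy labels are invented for illustration, and this is not the exact code path the patch uses internally.)

    import numpy as np
    from sklearn.metrics import precision_recall_fscore_support

    # Toy multiclass labels, invented for illustration only.
    y_true = [0, 0, 0, 1, 1, 2, 2, 2, 2]
    y_pred = [0, 1, 0, 1, 2, 2, 2, 0, 2]

    # Per-class values: one entry per label.
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)

    # 'macro': unweighted mean over classes; ignores class imbalance.
    macro_p = np.mean(p)

    # 'weighted': mean over classes weighted by support; accounts for imbalance.
    weighted_p = np.average(p, weights=s)

    # 'micro': pool every individual decision before dividing. With exactly one
    # predicted label per sample this makes precision, recall and f1 all equal
    # to plain accuracy.
    micro_p = np.mean(np.asarray(y_true) == np.asarray(y_pred))

    print(macro_p, weighted_p, micro_p)
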
+ Parameters ---------- y_true : array, shape = [n_samples] @@ -403,6 +437,19 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None): beta : float, 1.0 by default the strength of recall versus precision in the f-score + labels : array + integer array of labels + + pos_label : int + in the binary classification case, give the label of the positive + class (default is 1). Everything else but 'pos_label' + is considered to belong to the negative class. + Set to None in the case of multiclass classification. + + average : string, ['micro', 'macro', 'weighted'] + in the multiclass classification case, this determines the + type of averaging performed on the data. + Returns ------- precision: array, shape = [n_unique_labels], dtype = np.double @@ -456,28 +503,36 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None): finally: np.seterr(**old_err_settings) - return precision, recall, fscore, support - - -def avg_f1_score(y_true, y_pred): - """Return the average f1 score - - Parameters - ---------- - y_true : array, shape = [n_samples] - true targets - - y_pred : array, shape = [n_samples] - estimated targets - - Returns - ------- - avg_f1_score : float - average of the f1_scores of all classes - - """ - _, _, f1, support = precision_recall_fscore_support(y_true, y_pred) - return np.average(f1, weights=support) + if pos_label is not None: + if precision.shape[0] != 2: + raise ValueError(("pos_label should be set to None for multiclass " + "tasks got %d") % pos_label) + if pos_label not in labels: + raise ValueError("pos_label=%d is not a valid label: %r" % (pos_label, + labels)) + pos_label_idx = list(labels).index(pos_label) + return (precision[pos_label_idx], recall[pos_label_idx], + fscore[pos_label_idx], support[pos_label_idx]) + else: + average_options = (None, 'micro', 'macro', 'weighted') + if average not in average_options: + raise ValueError('average has to be one of ' + + str(average_options)) + if average is None: + return precision, recall, fscore, support + if average == 'micro': + avg_precision = true_pos.sum() / (true_pos.sum() + false_pos.sum()) + avg_recall = true_pos.sum() / (true_pos.sum() + false_neg.sum()) + avg_fscore = (1 + beta2) * (avg_precision * avg_recall) / (beta2 * avg_precision + avg_recall) + if average == 'macro': + avg_precision = np.mean(precision) + avg_recall = np.mean(recall) + avg_fscore = np.mean(fscore) + if average == 'weighted': + avg_precision = np.average(precision, weights=support) + avg_recall = np.average(recall, weights=support) + avg_fscore = np.average(fscore, weights=support) + return avg_precision, avg_recall, avg_fscore, None def matthews_corrcoef(y_true, y_pred): @@ -503,7 +558,7 @@ def matthews_corrcoef(y_true, y_pred): References ---------- http://en.wikipedia.org/wiki/Matthews_correlation_coefficient - doi: 10.1093/bioinformatics/16.5.412 + http://dx.doi.org/10.1093/bioinformatics/16.5.412 """ mcc = np.corrcoef(y_true, y_pred)[0, 1] diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 8ac6bc309e854..573644e7eb2aa 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -4,7 +4,7 @@ from nose.tools import raises from nose.tools import assert_true from numpy.testing import assert_array_almost_equal -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_raises from numpy.testing import assert_equal, assert_almost_equal from ... 
import datasets @@ -15,7 +15,6 @@ from ..metrics import explained_variance_score from ..metrics import r2_score from ..metrics import f1_score -from ..metrics import avg_f1_score from ..metrics import matthews_corrcoef from ..metrics import mean_square_error from ..metrics import precision_recall_curve @@ -142,9 +141,6 @@ def test_precision_recall_f1_score_binary(): fs = f1_score(y_true, y_pred) assert_array_almost_equal(fs, 0.74, 2) - afs = avg_f1_score(y_true, y_pred) - assert_array_almost_equal(afs, 0.74, decimal=2) - def test_confusion_matrix_binary(): """Test confusion matrix - binary classification case""" @@ -183,14 +179,36 @@ def test_precision_recall_f1_score_multiclass(): # multiclass case the score is the wieghthed average of the individual # class values hence f1_score is not necessary between precision_score and # recall_score - ps = precision_score(y_true, y_pred) + func = lambda : precision_recall_fscore_support(y_true, y_pred, pos_label=1) + assert_raises(ValueError, func) + + # averaging tests + ps = precision_score(y_true, y_pred, pos_label=None, average='micro') assert_array_almost_equal(ps, 0.62, 2) - rs = recall_score(y_true, y_pred) + rs = recall_score(y_true, y_pred, pos_label=None, average='micro') assert_array_almost_equal(rs, 0.61, 2) - fs = f1_score(y_true, y_pred) - assert_array_almost_equal(fs, 0.56, 2) + fs = f1_score(y_true, y_pred, pos_label=None, average='micro') + assert_array_almost_equal(fs, 0.61, 2) + + ps = precision_score(y_true, y_pred, pos_label=None, average='macro') + assert_array_almost_equal(ps, 0.62, 2) + + rs = recall_score(y_true, y_pred, pos_label=None, average='macro') + assert_array_almost_equal(rs, 0.66, 2) + + fs = f1_score(y_true, y_pred, pos_label=None, average='macro') + assert_array_almost_equal(fs, 0.58, 2) + + ps = precision_score(y_true, y_pred, pos_label=None, average='weighted') + assert_array_almost_equal(ps, 0.62, 2) + + rs = recall_score(y_true, y_pred, pos_label=None, average='weighted') + assert_array_almost_equal(rs, 0.61, 2) + + fs = f1_score(y_true, y_pred, pos_label=None, average='weighted') + assert_array_almost_equal(fs, 0.55, 2) # same prediction but with and explicit label ordering p, r, f, s = precision_recall_fscore_support( @@ -210,9 +228,9 @@ def test_zero_precision_recall(): y_true = np.array([0, 1, 2, 0, 1, 2]) y_pred = np.array([2, 0, 1, 1, 2, 0]) - assert_almost_equal(precision_score(y_true, y_pred), 0.0, 2) - assert_almost_equal(recall_score(y_true, y_pred), 0.0, 2) - assert_almost_equal(f1_score(y_true, y_pred), 0.0, 2) + assert_almost_equal(precision_score(y_true, y_pred, pos_label=None, average='weighted'), 0.0, 2) + assert_almost_equal(recall_score(y_true, y_pred, pos_label=None, average='weighted'), 0.0, 2) + assert_almost_equal(f1_score(y_true, y_pred, pos_label=None, average='weighted'), 0.0, 2) finally: np.seterr(**old_error_settings) From bbd8b71fdd18f85e3029bd335038110f38bc49e9 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Thu, 15 Dec 2011 14:34:36 +0100 Subject: [PATCH 06/15] doc: added description for matthew's corrcoef from wikipedia --- sklearn/metrics/metrics.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index c973a93447eb1..731574f3bc58e 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -538,6 +538,16 @@ class (default is 1). 
Everything else but 'pos_label' def matthews_corrcoef(y_true, y_pred): """Returns matthew's correlation coefficient for binary classes + The Matthews correlation coefficient is used in machine learning as a + measure of the quality of binary (two-class) classifications. It takes + into account true and false positives and negatives and is generally + regarded as a balanced measure which can be used even if the classes are + of very different sizes. The MCC is in essence a correlation coefficient + between the observed and predicted binary classifications; it returns a + value between −1 and +1. A coefficient of +1 represents a perfect + prediction, 0 an average random prediction and −1 an inverse prediction. + The statistic is also known as the phi coefficient. [source: Wikipedia] + Only in the binary case does this relate to information about true and false positives and negatives. See references below. From 493e20e142e8def62a65aee819cfb4ae0c917a4a Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Thu, 15 Dec 2011 14:36:24 +0100 Subject: [PATCH 07/15] sty: pep8 fixes --- sklearn/metrics/metrics.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 731574f3bc58e..390e80dba42d3 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -295,7 +295,8 @@ class (default is 1). Everything else but 'pos_label' return r -def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, average='micro'): +def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, + average='micro'): """Compute fbeta score The F_beta score is the weighted harmonic mean of precision and recall, @@ -508,8 +509,8 @@ class (default is 1). Everything else but 'pos_label' raise ValueError(("pos_label should be set to None for multiclass " "tasks got %d") % pos_label) if pos_label not in labels: - raise ValueError("pos_label=%d is not a valid label: %r" % (pos_label, - labels)) + raise ValueError("pos_label=%d is not a valid label: %r" % + (pos_label, labels)) pos_label_idx = list(labels).index(pos_label) return (precision[pos_label_idx], recall[pos_label_idx], fscore[pos_label_idx], support[pos_label_idx]) @@ -521,9 +522,11 @@ class (default is 1). 
Everything else but 'pos_label' if average is None: return precision, recall, fscore, support if average == 'micro': - avg_precision = true_pos.sum() / (true_pos.sum() + false_pos.sum()) + avg_precision = true_pos.sum() / (true_pos.sum() + + false_pos.sum()) avg_recall = true_pos.sum() / (true_pos.sum() + false_neg.sum()) - avg_fscore = (1 + beta2) * (avg_precision * avg_recall) / (beta2 * avg_precision + avg_recall) + avg_fscore = (1 + beta2) * (avg_precision * avg_recall) / \ + (beta2 * avg_precision + avg_recall) if average == 'macro': avg_precision = np.mean(precision) avg_recall = np.mean(recall) From e69c24d56c0297df8cb5f9d6a71ad04465903dcd Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Thu, 15 Dec 2011 15:09:11 +0100 Subject: [PATCH 08/15] sty: pep8 on test file --- sklearn/metrics/tests/test_metrics.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 573644e7eb2aa..63d982e23328b 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -149,16 +149,16 @@ def test_confusion_matrix_binary(): cm = confusion_matrix(y_true, y_pred) assert_array_equal(cm, [[19, 6], [7, 18]]) - tp = cm[0,0] - tn = cm[1,1] - fp = cm[0,1] - fn = cm[1,0] - num = (tp*tn-fp*fn) - den = np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)) + tp = cm[0, 0] + tn = cm[1, 1] + fp = cm[0, 1] + fn = cm[1, 0] + num = (tp * tn - fp * fn) + den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) if den == 0.: true_mcc = 0 else: - true_mcc = num/den + true_mcc = num / den mcc = matthews_corrcoef(y_true, y_pred) assert_array_almost_equal(mcc, true_mcc, decimal=2) assert_array_almost_equal(mcc, 0.48, decimal=2) @@ -179,7 +179,8 @@ def test_precision_recall_f1_score_multiclass(): # multiclass case the score is the wieghthed average of the individual # class values hence f1_score is not necessary between precision_score and # recall_score - func = lambda : precision_recall_fscore_support(y_true, y_pred, pos_label=1) + func = lambda: precision_recall_fscore_support(y_true, y_pred, + pos_label=1) assert_raises(ValueError, func) # averaging tests @@ -228,9 +229,12 @@ def test_zero_precision_recall(): y_true = np.array([0, 1, 2, 0, 1, 2]) y_pred = np.array([2, 0, 1, 1, 2, 0]) - assert_almost_equal(precision_score(y_true, y_pred, pos_label=None, average='weighted'), 0.0, 2) - assert_almost_equal(recall_score(y_true, y_pred, pos_label=None, average='weighted'), 0.0, 2) - assert_almost_equal(f1_score(y_true, y_pred, pos_label=None, average='weighted'), 0.0, 2) + assert_almost_equal(precision_score(y_true, y_pred, pos_label=None, + average='weighted'), 0.0, 2) + assert_almost_equal(recall_score(y_true, y_pred, pos_label=None, + average='weighted'), 0.0, 2) + assert_almost_equal(f1_score(y_true, y_pred, pos_label=None, + average='weighted'), 0.0, 2) finally: np.seterr(**old_error_settings) From 8c052b4d2476a3148dfa4371b8de2e5130e5be2a Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Thu, 15 Dec 2011 15:17:10 +0100 Subject: [PATCH 09/15] doc: removed strange character --- sklearn/metrics/metrics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 390e80dba42d3..792472ceedafe 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -546,9 +546,8 @@ def matthews_corrcoef(y_true, y_pred): into account true and false positives and negatives and is generally regarded as a 
balanced measure which can be used even if the classes are of very different sizes. The MCC is in essence a correlation coefficient - between the observed and predicted binary classifications; it returns a - value between −1 and +1. A coefficient of +1 represents a perfect - prediction, 0 an average random prediction and −1 an inverse prediction. + value between -1 and +1. A coefficient of +1 represents a perfect + prediction, 0 an average random prediction and -1 an inverse prediction. The statistic is also known as the phi coefficient. [source: Wikipedia] Only in the binary case does this relate to information about true and From 584a2e667a84d118f3dc60709c6202bc13cb95bb Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Thu, 15 Dec 2011 15:17:40 +0100 Subject: [PATCH 10/15] fix: updated tests to reflect that micro shows the same precision and recall --- sklearn/metrics/tests/test_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 63d982e23328b..baf5942db8c49 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -185,7 +185,7 @@ def test_precision_recall_f1_score_multiclass(): # averaging tests ps = precision_score(y_true, y_pred, pos_label=None, average='micro') - assert_array_almost_equal(ps, 0.62, 2) + assert_array_almost_equal(ps, 0.61, 2) rs = recall_score(y_true, y_pred, pos_label=None, average='micro') assert_array_almost_equal(rs, 0.61, 2) From 3ade680f85d7384d1d7ec94a366f2ad904cea119 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Thu, 15 Dec 2011 15:36:11 +0100 Subject: [PATCH 11/15] fix: average with elif --- sklearn/metrics/metrics.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 792472ceedafe..e36966cb21a0e 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -425,7 +425,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, The support is the number of occurrences of each class in y_true. If pos_label is None, this function returns the average precision, recall - and f-measure. The averaging is either 'micro', 'macro', 'weighted'. + and f-measure if `average` is one of 'micro', 'macro', 'weighted'. Parameters ---------- @@ -516,25 +516,26 @@ class (default is 1). 
Everything else but 'pos_label' fscore[pos_label_idx], support[pos_label_idx]) else: average_options = (None, 'micro', 'macro', 'weighted') - if average not in average_options: - raise ValueError('average has to be one of ' + - str(average_options)) if average is None: return precision, recall, fscore, support - if average == 'micro': + elif average == 'micro': avg_precision = true_pos.sum() / (true_pos.sum() + false_pos.sum()) avg_recall = true_pos.sum() / (true_pos.sum() + false_neg.sum()) avg_fscore = (1 + beta2) * (avg_precision * avg_recall) / \ (beta2 * avg_precision + avg_recall) - if average == 'macro': + elif average == 'macro': avg_precision = np.mean(precision) avg_recall = np.mean(recall) avg_fscore = np.mean(fscore) - if average == 'weighted': + elif average == 'weighted': avg_precision = np.average(precision, weights=support) avg_recall = np.average(recall, weights=support) avg_fscore = np.average(fscore, weights=support) + else: + raise ValueError('average has to be one of ' + + str(average_options)) + return avg_precision, avg_recall, avg_fscore, None From 9a6d934b06713e10b144cadbbcab6e38144888b8 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Thu, 15 Dec 2011 15:50:24 +0100 Subject: [PATCH 12/15] doc: improved description of average --- sklearn/metrics/metrics.py | 49 ++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index e36966cb21a0e..647669d444e6c 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -205,7 +205,7 @@ def auc(x, y): return area -def precision_score(y_true, y_pred, labels=None, pos_label=1, average='micro'): +def precision_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute the precision The precision is the ratio :math:`tp / (tp + fp)` where tp is the @@ -232,15 +232,20 @@ class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. Set to None in the case of multiclass classification. - average : string, ['micro', 'macro', 'weighted'] + average : string, ['micro', 'macro', 'weighted'(default)] in the multiclass classification case, this determines the type of averaging performed on the data. + macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- precision : float precision of the positive class in binary classification or - weighted avergage of the precision of each class for the + weighted average of the precision of each class for the multiclass task """ @@ -251,7 +256,7 @@ class (default is 1). Everything else but 'pos_label' return p -def recall_score(y_true, y_pred, labels=None, pos_label=1, average='micro'): +def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute the recall The recall is the ratio :math:`tp / (tp + fn)` where tp is the number of @@ -277,15 +282,20 @@ class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. Set to None in the case of multiclass classification. - average : string, ['micro', 'macro', 'weighted'] + average : string, [None, 'micro', 'macro', 'weighted'(default)] in the multiclass classification case, this determines the type of averaging performed on the data. 
+ macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- recall : float recall of the positive class in binary classification or weighted - avergage of the recall of each class for the multiclass task. + average of the recall of each class for the multiclass task. """ _, r, _, _ = precision_recall_fscore_support(y_true, y_pred, @@ -296,7 +306,7 @@ class (default is 1). Everything else but 'pos_label' def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, - average='micro'): + average='weighted'): """Compute fbeta score The F_beta score is the weighted harmonic mean of precision and recall, @@ -325,15 +335,20 @@ class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. Set to None in the case of multiclass classification. - average : string, ['micro', 'macro', 'weighted'] + average : string, [None, 'micro', 'macro', 'weighted'(default)] in the multiclass classification case, this determines the type of averaging performed on the data. + macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- fbeta_score : float fbeta_score of the positive class in binary classification or weighted - avergage of the fbeta_score of each class for the multiclass task. + average of the fbeta_score of each class for the multiclass task. See also -------- @@ -351,7 +366,7 @@ class (default is 1). Everything else but 'pos_label' return f -def f1_score(y_true, y_pred, labels=None, pos_label=1, average='micro'): +def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute f1 score The F1 score can be interpreted as a weighted average of the precision @@ -383,9 +398,14 @@ class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. Set to None in the case of multiclass classification. - average : string, ['micro', 'macro', 'weighted'] + average : string, [None, 'micro', 'macro', 'weighted'(default)] in the multiclass classification case, this determines the type of averaging performed on the data. + macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- @@ -447,9 +467,14 @@ class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. Set to None in the case of multiclass classification. - average : string, ['micro', 'macro', 'weighted'] + average : string, [None (default), 'micro', 'macro', 'weighted'] in the multiclass classification case, this determines the type of averaging performed on the data. 
+ macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- From 0a8b6e51f3afe61ee754c4d894f0a130748c44b1 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Wed, 21 Dec 2011 09:45:23 +0100 Subject: [PATCH 13/15] api: changed pos_label to None for metrics --- sklearn/metrics/metrics.py | 8 ++++---- sklearn/metrics/tests/test_metrics.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 647669d444e6c..5772fd21765ea 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -205,7 +205,7 @@ def auc(x, y): return area -def precision_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): +def precision_score(y_true, y_pred, labels=None, pos_label=None, average='weighted'): """Compute the precision The precision is the ratio :math:`tp / (tp + fp)` where tp is the @@ -256,7 +256,7 @@ class (default is 1). Everything else but 'pos_label' return p -def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): +def recall_score(y_true, y_pred, labels=None, pos_label=None, average='weighted'): """Compute the recall The recall is the ratio :math:`tp / (tp + fn)` where tp is the number of @@ -305,7 +305,7 @@ class (default is 1). Everything else but 'pos_label' return r -def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, +def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=None, average='weighted'): """Compute fbeta score @@ -366,7 +366,7 @@ class (default is 1). Everything else but 'pos_label' return f -def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): +def f1_score(y_true, y_pred, labels=None, pos_label=None, average='weighted'): """Compute f1 score The F1 score can be interpreted as a weighted average of the precision diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index baf5942db8c49..3716890704567 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -136,7 +136,7 @@ def test_precision_recall_f1_score_binary(): assert_array_almost_equal(ps, 0.75, 2) rs = recall_score(y_true, y_pred) - assert_array_almost_equal(rs, 0.72, 2) + assert_array_almost_equal(rs, 0.74, 2) fs = f1_score(y_true, y_pred) assert_array_almost_equal(fs, 0.74, 2) From 2fa7a096a87a2e14f50b76015d20f9040cfcd8dc Mon Sep 17 00:00:00 2001 From: Mathieu Blondel Date: Fri, 27 Jan 2012 21:26:36 +0900 Subject: [PATCH 14/15] Backward compatibility in precision, recall and f1-score. --- sklearn/metrics/metrics.py | 24 +++++++------- sklearn/metrics/tests/test_metrics.py | 46 +++++++++++---------------- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 8af3ac0435328..7da145f145544 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -212,7 +212,7 @@ def auc(x, y): return area -def precision_score(y_true, y_pred, labels=None, pos_label=None, average='weighted'): +def precision_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute the precision The precision is the ratio :math:`tp / (tp + fp)` where tp is the @@ -263,7 +263,7 @@ class (default is 1). 
Everything else but 'pos_label' return p -def recall_score(y_true, y_pred, labels=None, pos_label=None, average='weighted'): +def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute the recall The recall is the ratio :math:`tp / (tp + fn)` where tp is the number of @@ -312,7 +312,7 @@ class (default is 1). Everything else but 'pos_label' return r -def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=None, +def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, average='weighted'): """Compute fbeta score @@ -373,7 +373,7 @@ class (default is 1). Everything else but 'pos_label' return f -def f1_score(y_true, y_pred, labels=None, pos_label=None, average='weighted'): +def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute f1 score The F1 score can be interpreted as a weighted average of the precision @@ -431,7 +431,7 @@ class (default is 1). Everything else but 'pos_label' def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, - pos_label=None, average=None): + pos_label=1, average="weighted"): """Compute precisions, recalls, f-measures and support for each class The precision is the ratio :math:`tp / (tp + fp)` where tp is the number of @@ -540,10 +540,10 @@ class (default is 1). Everything else but 'pos_label' finally: np.seterr(**old_err_settings) - if pos_label is not None: - if precision.shape[0] != 2: - raise ValueError(("pos_label should be set to None for multiclass " - "tasks got %d") % pos_label) + if n_labels == 2: + if not average: + return precision, recall, fscore, support + if pos_label not in labels: raise ValueError("pos_label=%d is not a valid label: %r" % (pos_label, labels)) @@ -552,7 +552,7 @@ class (default is 1). Everything else but 'pos_label' fscore[pos_label_idx], support[pos_label_idx]) else: average_options = (None, 'micro', 'macro', 'weighted') - if average is None: + if not average: return precision, recall, fscore, support elif average == 'micro': avg_precision = true_pos.sum() / (true_pos.sum() + @@ -666,7 +666,9 @@ def classification_report(y_true, y_pred, labels=None, target_names=None): report += '\n' p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, - labels=labels) + labels=labels, + average=None) + for i, label in enumerate(labels): values = [target_names[i]] for v in (p[i], r[i], f1[i]): diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 2ab0f118c7fdd..41af3b9f679a4 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -139,7 +139,7 @@ def test_precision_recall_f1_score_binary(): y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class - p, r, f, s = precision_recall_fscore_support(y_true, y_pred) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) assert_array_almost_equal(p, [0.73, 0.75], 2) assert_array_almost_equal(r, [0.76, 0.72], 2) assert_array_almost_equal(f, [0.75, 0.74], 2) @@ -152,7 +152,7 @@ def test_precision_recall_f1_score_binary(): assert_array_almost_equal(ps, 0.75, 2) rs = recall_score(y_true, y_pred) - assert_array_almost_equal(rs, 0.74, 2) + assert_array_almost_equal(rs, 0.72, 2) fs = f1_score(y_true, y_pred) assert_array_almost_equal(fs, 0.74, 2) @@ -185,51 +185,43 @@ def test_precision_recall_f1_score_multiclass(): y_true, y_pred, _ = make_prediction(binary=False) # compute scores with default labels introspection - p, r, f, s = precision_recall_fscore_support(y_true, y_pred) + p, 
r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) assert_array_almost_equal(p, [0.82, 0.55, 0.47], 2) assert_array_almost_equal(r, [0.92, 0.17, 0.90], 2) assert_array_almost_equal(f, [0.87, 0.26, 0.62], 2) assert_array_equal(s, [25, 30, 20]) - # individual scoring function that can be used for grid search: in the - # multiclass case the score is the wieghthed average of the individual - # class values hence f1_score is not necessary between precision_score and - # recall_score - func = lambda: precision_recall_fscore_support(y_true, y_pred, - pos_label=1) - assert_raises(ValueError, func) - # averaging tests - ps = precision_score(y_true, y_pred, pos_label=None, average='micro') + ps = precision_score(y_true, y_pred, pos_label=1, average='micro') assert_array_almost_equal(ps, 0.61, 2) - rs = recall_score(y_true, y_pred, pos_label=None, average='micro') + rs = recall_score(y_true, y_pred, average='micro') assert_array_almost_equal(rs, 0.61, 2) - fs = f1_score(y_true, y_pred, pos_label=None, average='micro') + fs = f1_score(y_true, y_pred, average='micro') assert_array_almost_equal(fs, 0.61, 2) - ps = precision_score(y_true, y_pred, pos_label=None, average='macro') + ps = precision_score(y_true, y_pred, average='macro') assert_array_almost_equal(ps, 0.62, 2) - rs = recall_score(y_true, y_pred, pos_label=None, average='macro') + rs = recall_score(y_true, y_pred, average='macro') assert_array_almost_equal(rs, 0.66, 2) - fs = f1_score(y_true, y_pred, pos_label=None, average='macro') + fs = f1_score(y_true, y_pred, average='macro') assert_array_almost_equal(fs, 0.58, 2) - ps = precision_score(y_true, y_pred, pos_label=None, average='weighted') + ps = precision_score(y_true, y_pred, average='weighted') assert_array_almost_equal(ps, 0.62, 2) - rs = recall_score(y_true, y_pred, pos_label=None, average='weighted') + rs = recall_score(y_true, y_pred, average='weighted') assert_array_almost_equal(rs, 0.61, 2) - fs = f1_score(y_true, y_pred, pos_label=None, average='weighted') + fs = f1_score(y_true, y_pred, average='weighted') assert_array_almost_equal(fs, 0.55, 2) # same prediction but with and explicit label ordering p, r, f, s = precision_recall_fscore_support( - y_true, y_pred, labels=[0, 2, 1]) + y_true, y_pred, labels=[0, 2, 1], average=None) assert_array_almost_equal(p, [0.82, 0.47, 0.55], 2) assert_array_almost_equal(r, [0.92, 0.90, 0.17], 2) assert_array_almost_equal(f, [0.87, 0.62, 0.26], 2) @@ -245,12 +237,12 @@ def test_zero_precision_recall(): y_true = np.array([0, 1, 2, 0, 1, 2]) y_pred = np.array([2, 0, 1, 1, 2, 0]) - assert_almost_equal(precision_score(y_true, y_pred, pos_label=None, - average='weighted'), 0.0, 2) - assert_almost_equal(recall_score(y_true, y_pred, pos_label=None, - average='weighted'), 0.0, 2) - assert_almost_equal(f1_score(y_true, y_pred, pos_label=None, - average='weighted'), 0.0, 2) + assert_almost_equal(precision_score(y_true, y_pred, average='weighted'), + 0.0, 2) + assert_almost_equal(recall_score(y_true, y_pred, average='weighted'), + 0.0, 2) + assert_almost_equal(f1_score(y_true, y_pred, average='weighted'), + 0.0, 2) finally: np.seterr(**old_error_settings) From f62398b86208979e326e1033871ff6930fba85ef Mon Sep 17 00:00:00 2001 From: Mathieu Blondel Date: Fri, 27 Jan 2012 21:40:32 +0900 Subject: [PATCH 15/15] Factor some code. 
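
(Aside: a rough usage sketch of the calling convention this backward-compatibility change settles on: binary problems keep returning the score of pos_label by default, multiclass problems take an explicit average, and average=None yields per-class arrays. The label arrays are invented for illustration.)

    import numpy as np
    from sklearn.metrics import f1_score, precision_recall_fscore_support

    # Invented binary labels.
    y_true = np.array([0, 1, 1, 0, 1, 1, 0, 0])
    y_pred = np.array([0, 1, 0, 0, 1, 1, 1, 0])

    # Binary case: the plain call returns the F1 of the positive class (pos_label=1).
    print(f1_score(y_true, y_pred))

    # Invented multiclass labels.
    y_true_mc = np.array([0, 1, 2, 0, 1, 2, 2, 1])
    y_pred_mc = np.array([0, 2, 1, 0, 1, 2, 2, 0])

    # Multiclass case: pick an averaging mode explicitly, or ask for the
    # per-class arrays with average=None.
    print(f1_score(y_true_mc, y_pred_mc, average='macro'))
    p, r, f, s = precision_recall_fscore_support(y_true_mc, y_pred_mc, average=None)
    print(f)  # one F1 value per class
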
--- sklearn/metrics/metrics.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 7da145f145544..5b64e21c0e5cf 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -540,10 +540,10 @@ class (default is 1). Everything else but 'pos_label' finally: np.seterr(**old_err_settings) - if n_labels == 2: - if not average: - return precision, recall, fscore, support + if not average: + return precision, recall, fscore, support + elif n_labels == 2: if pos_label not in labels: raise ValueError("pos_label=%d is not a valid label: %r" % (pos_label, labels)) @@ -552,9 +552,7 @@ class (default is 1). Everything else but 'pos_label' fscore[pos_label_idx], support[pos_label_idx]) else: average_options = (None, 'micro', 'macro', 'weighted') - if not average: - return precision, recall, fscore, support - elif average == 'micro': + if average == 'micro': avg_precision = true_pos.sum() / (true_pos.sum() + false_pos.sum()) avg_recall = true_pos.sum() / (true_pos.sum() + false_neg.sum())
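
(Aside: a small check of the identity behind matthews_corrcoef as added earlier in this series: for binary labels the Pearson correlation of the two label vectors, which is what np.corrcoef computes, agrees with the closed-form confusion-matrix formula the tests compare against. The label arrays are invented for illustration.)

    import numpy as np
    from sklearn.metrics import confusion_matrix, matthews_corrcoef

    # Invented binary labels.
    y_true = np.array([1, 1, 1, 0, 0, 1, 0, 1, 0, 0])
    y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 1, 0, 0])

    # MCC written out from confusion-matrix counts; the formula is symmetric in
    # the two classes, so the exact index convention does not change the value.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    num = tp * tn - fp * fn
    den = np.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
    mcc_closed_form = 0.0 if den == 0 else num / den

    # MCC as the correlation between the observed and predicted labels, which is
    # how the patch implements it via np.corrcoef.
    mcc_corr = np.corrcoef(y_true, y_pred)[0, 1]

    print(mcc_closed_form, mcc_corr, matthews_corrcoef(y_true, y_pred))
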