From 068dcc399a9b93b658be7de71eb1c95aae3a71aa Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 22 May 2013 12:05:10 +0200 Subject: [PATCH 01/14] FIX bug in f_score with beta !=1 --- sklearn/metrics/metrics.py | 2 +- sklearn/metrics/tests/test_metrics.py | 65 ++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 5a97f503682e3..3a15ae155c239 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1458,7 +1458,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, precision = size_inter / size_true recall = size_inter / size_pred - f_score = ((1 + beta2 ** 2) * size_inter / + f_score = ((1 + beta2) * size_inter / (beta2 * size_pred + size_true)) finally: np.seterr(**old_err_settings) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 90cdd7f258da4..985f3cfaf6ab4 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -503,6 +503,9 @@ def test_precision_recall_f1_score_binary(): fs = f1_score(y_true, y_pred) assert_array_almost_equal(fs, 0.76, 2) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2), + (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2) + def test_precision_recall_f_binary_single_class(): """Test precision, recall and F1 score behave with a single positive or @@ -1467,6 +1470,10 @@ def test_precision_recall_f1_score_multilabel_1(): assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) assert_array_almost_equal(s, [1, 1, 1, 1], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) + # Check macro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") @@ -1474,6 +1481,9 @@ def test_precision_recall_f1_score_multilabel_1(): assert_almost_equal(r, 0.5) assert_almost_equal(f, 2.5 / 1.5 * 0.25) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="macro"), + np.mean(f2)) # Check micro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1482,6 +1492,9 @@ def test_precision_recall_f1_score_multilabel_1(): assert_almost_equal(r, 0.5) assert_almost_equal(f, 0.5) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="micro"), + (1 + 4) * p * r / (4 * p + r)) # Check weigted p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1490,7 +1503,9 @@ def test_precision_recall_f1_score_multilabel_1(): assert_almost_equal(r, 0.5) assert_almost_equal(f, 2.5 / 1.5 * 0.25) assert_equal(s, None) - + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="weighted"), + np.average(f2, weights=support)) # Check weigted # |h(x_i) inter y_i | = [0, 1, 1] # |y_i| = [1, 1, 2] @@ -1501,7 +1516,9 @@ def test_precision_recall_f1_score_multilabel_1(): assert_almost_equal(r, 0.5) assert_almost_equal(f, 0.5) assert_equal(s, None) - + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), + 0.5) def test_precision_recall_f1_score_multilabel_2(): """ Test precision_recall_f1_score on a crafted multilabel example 2 @@ -1526,12 +1543,20 @@ def test_precision_recall_f1_score_multilabel_2(): assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) assert_array_almost_equal(s, [1, 2, 1, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) + + p, r, f, s = 
precision_recall_fscore_support(y_true, y_pred, average="micro") assert_almost_equal(p, 0.25) assert_almost_equal(r, 0.25) assert_almost_equal(f, 2 * 0.25 * 0.25 / 0.5) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="micro"), + (1 + 4) * p * r / (4 * p + r)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") @@ -1539,6 +1564,9 @@ def test_precision_recall_f1_score_multilabel_2(): assert_almost_equal(r, 0.125) assert_almost_equal(f, 2 / 12) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="macro"), + np.mean(f2)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") @@ -1546,6 +1574,9 @@ def test_precision_recall_f1_score_multilabel_2(): assert_almost_equal(r, 1 / 4) assert_almost_equal(f, 2 / 3 * 2 / 4) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="weighted"), + np.average(f2, weights=support)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") @@ -1557,6 +1588,9 @@ def test_precision_recall_f1_score_multilabel_2(): assert_almost_equal(r, 1 / 6) assert_almost_equal(f, 2 / 4 * 1 / 3) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), + 0.1666, 2) def test_precision_recall_f1_score_with_an_empty_prediction(): @@ -1580,12 +1614,19 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) assert_array_almost_equal(s, [1, 2, 1, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") assert_almost_equal(p, 0.5) assert_almost_equal(r, 1.5 / 4) assert_almost_equal(f, 2.5 / (4 * 1.5)) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="macro"), + np.mean(f2)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") @@ -1593,6 +1634,9 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): assert_almost_equal(r, 0.5) assert_almost_equal(f, 2 / 3 / (2 / 3 + 0.5)) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="micro"), + (1 + 4) * p * r / (4 * p + r)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") @@ -1600,6 +1644,9 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): assert_almost_equal(r, 0.5) assert_almost_equal(f, (2 / 1.5 + 1) / 4) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="weighted"), + np.average(f2, weights=support)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") @@ -1610,6 +1657,9 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): assert_almost_equal(r, 2 / 3) assert_almost_equal(f, 1 / 3) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), + 0.333, 2) def test_precision_recall_f1_no_labels(): @@ -1628,6 +1678,8 @@ def test_precision_recall_f1_no_labels(): assert_array_almost_equal(r, [0, 0, 0], 2) assert_array_almost_equal(f, [0, 0, 0], 2) assert_array_almost_equal(s, [0, 0, 0], 2) + assert_array_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average=None), [0, 0, 0], 2) # Check macro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1636,6 +1688,8 @@ def test_precision_recall_f1_no_labels(): 
assert_almost_equal(r, 0) assert_almost_equal(f, 0) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="macro"), 0) # Check micro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1644,6 +1698,9 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(r, 0) assert_almost_equal(f, 0) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="micro"), 0) + # Check weighted p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1652,6 +1709,8 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(r, 0) assert_almost_equal(f, 0) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="weighted"), 0) # # Check example # |h(x_i) inter y_i | = [0, 0, 0] @@ -1663,6 +1722,8 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(r, 1) assert_almost_equal(f, 1) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), 1) def test__check_clf_targets(): From 1868790207f974e5874b9b6d1eeb581584336b0d Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 22 May 2013 15:47:39 +0200 Subject: [PATCH 02/14] FIX formula inversion for sample-based precision/recall --- sklearn/metrics/metrics.py | 18 ++++++++++-------- sklearn/metrics/tests/test_metrics.py | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 3a15ae155c239..a533e54271588 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1415,6 +1415,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, ... # doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) + """ if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") @@ -1448,24 +1449,24 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, size_true[i] = len(true_set) else: raise ValueError("Example-based precision, recall, fscore is " - "not meaning full outside multilabe" - "classification. See the accuracy_score instead.") + "not meaning full outside multilabel" + "classification. 
Use accuracy_score instead.") try: # oddly, we may get an "invalid" rather than a "divide" error # here old_err_settings = np.seterr(divide='ignore', invalid='ignore') - precision = size_inter / size_true - recall = size_inter / size_pred + precision = size_inter / size_pred + recall = size_inter / size_true f_score = ((1 + beta2) * size_inter / - (beta2 * size_pred + size_true)) + (beta2 * size_true + size_pred)) finally: np.seterr(**old_err_settings) - precision[size_true == 0] = 1.0 - recall[size_pred == 0] = 1.0 - f_score[(beta2 * size_pred + size_true) == 0] = 1.0 + precision[size_pred == 0] = 1.0 + recall[size_true == 0] = 1.0 + f_score[(beta2 * size_true + size_pred) == 0] = 1.0 precision = np.mean(precision) recall = np.mean(recall) @@ -1698,6 +1699,7 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): >>> recall_score(y_true, y_pred, average=None) array([ 1., 0., 0.]) + """ _, r, _, _ = precision_recall_fscore_support(y_true, y_pred, labels=labels, diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 985f3cfaf6ab4..e92b069a30a95 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1653,8 +1653,8 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): # |h(x_i) inter y_i | = [0, 0, 2] # |y_i| = [1, 1, 2] # |h(x_i)| = [0, 1, 2] - assert_almost_equal(p, 1 / 3) - assert_almost_equal(r, 2 / 3) + assert_almost_equal(p, 2 / 3) + assert_almost_equal(r, 1 / 3) assert_almost_equal(f, 1 / 3) assert_equal(s, None) assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, From 42a127084eff73906a89802fe748289e49d995fb Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 23 May 2013 08:56:20 +0200 Subject: [PATCH 03/14] FIX set same default behavior for precision, recall and f-score --- sklearn/metrics/metrics.py | 7 +++---- sklearn/metrics/tests/test_metrics.py | 11 +++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index a533e54271588..f43d5f2e8a2f4 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1415,7 +1415,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, ... 
# doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) - """ if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") @@ -1464,9 +1463,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, finally: np.seterr(**old_err_settings) - precision[size_pred == 0] = 1.0 - recall[size_true == 0] = 1.0 - f_score[(beta2 * size_true + size_pred) == 0] = 1.0 + precision[size_pred == 0] = 0.0 + recall[size_true == 0] = 0.0 + f_score[(beta2 * size_true + size_pred) == 0] = 0.0 precision = np.mean(precision) recall = np.mean(recall) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index e92b069a30a95..094df4651e614 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1653,7 +1653,7 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): # |h(x_i) inter y_i | = [0, 0, 2] # |y_i| = [1, 1, 2] # |h(x_i)| = [0, 1, 2] - assert_almost_equal(p, 2 / 3) + assert_almost_equal(p, 1 / 3) assert_almost_equal(r, 1 / 3) assert_almost_equal(f, 1 / 3) assert_equal(s, None) @@ -1701,7 +1701,6 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, average="micro"), 0) - # Check weighted p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") @@ -1718,12 +1717,12 @@ def test_precision_recall_f1_no_labels(): # |h(x_i)| = [1, 1, 2] p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") - assert_almost_equal(p, 1) - assert_almost_equal(r, 1) - assert_almost_equal(f, 1) + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) assert_equal(s, None) assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="samples"), 1) + average="samples"), 0) def test__check_clf_targets(): From 98a6b70135b532c4ddc37c01732e1a98f991338e Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 23 May 2013 15:14:46 +0200 Subject: [PATCH 04/14] ENH raise warning with ill define precision, recall and fscore --- sklearn/metrics/metrics.py | 72 ++++++++-- sklearn/metrics/tests/test_metrics.py | 186 +++++++++++++------------- 2 files changed, 155 insertions(+), 103 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index f43d5f2e8a2f4..a2f4984dbe16a 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1484,20 +1484,46 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, precision = divide(true_pos.astype(np.float), true_pos + false_pos) recall = divide(true_pos.astype(np.float), true_pos + false_neg) + idx_ill_defined_precision = (true_pos + false_pos) == 0 + idx_ill_defined_recall = (true_pos + false_neg) == 0 + # handle division by 0 in precision and recall - precision[(true_pos + false_pos) == 0] = 0.0 - recall[(true_pos + false_neg) == 0] = 0.0 + precision[idx_ill_defined_precision] = 0.0 + recall[idx_ill_defined_recall] = 0.0 # fbeta score fscore = divide((1 + beta2) * precision * recall, beta2 * precision + recall) # handle division by 0 in fscore - fscore[(beta2 * precision + recall) == 0] = 0.0 + idx_ill_defined_fbeta_score = (beta2 * precision + recall) == 0 + fscore[idx_ill_defined_fbeta_score] = 0.0 finally: np.seterr(**old_err_settings) if not average: + warning_msg = "" + if np.any(idx_ill_defined_precision): + warning_msg += ("The sum of true positives and false positives " + "are equal to zero for some labels. Precision is " + "ill defined for those labels %s. 
" + % labels[idx_ill_defined_precision]) + + if np.any(idx_ill_defined_recall): + warning_msg += ("The sum of true positives and false negatives " + "are equal to zero for some labels. Recall is ill " + "defined for those labels %s. " + % labels[idx_ill_defined_recall]) + + if np.any(idx_ill_defined_fbeta_score): + warning_msg += ("The precision and recall are equal to zero for " + "some labels. fbeta_score is ill defined for " + "those labels %s. " + % labels[idx_ill_defined_fbeta_score]) + + if warning_msg: + warnings.warn(warning_msg) + return precision, recall, fscore, support elif y_type == 'binary' and pos_label is not None: @@ -1513,24 +1539,42 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, else: average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average == 'micro': - avg_precision = divide(true_pos.sum(), - true_pos.sum() + false_pos.sum(), - dtype=np.double) - avg_recall = divide(true_pos.sum(), - true_pos.sum() + false_neg.sum(), - dtype=np.double) - avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), - beta2 * avg_precision + avg_recall, - dtype=np.double) + try: + # oddly, we may get an "invalid" rather than a "divide" error here + old_err_settings = np.seterr(divide='ignore', invalid='ignore') + avg_precision = divide(true_pos.sum(), + true_pos.sum() + false_pos.sum(), + dtype=np.double) + avg_recall = divide(true_pos.sum(), + true_pos.sum() + false_neg.sum(), + dtype=np.double) + avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), + beta2 * avg_precision + avg_recall, + dtype=np.double) + finally: + np.seterr(**old_err_settings) + + warning_msg = "" if np.isnan(avg_precision): avg_precision = 0. + warning_msg += ("The sum of true positives and false " + "positives are equal to zero. Micro-precision" + " is ill defined. ") if np.isnan(avg_recall): avg_recall = 0. + warning_msg += ("The sum of true positives and false " + "negatives are equal to zero. Micro-recall " + "is ill defined. ") if np.isnan(avg_fscore): avg_fscore = 0. + warning_msg += ("Micro-precision and micro-recall are equal " + "to zero. Micro-fbeta_score is ill defined.") + + if warning_msg: + warnings.warn(warning_msg) elif average == 'macro': avg_precision = np.mean(precision) @@ -1542,6 +1586,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, avg_precision = 0. avg_recall = 0. avg_fscore = 0. + warnings.warn("There isn't any labels in y_true. 
" + "Weighted-precision, weighted-recall and " + "weighted-fbeta_score are ill defined.") + else: avg_precision = np.average(precision, weights=support) avg_recall = np.average(recall, weights=support) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 094df4651e614..de3734b7cb4de 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1458,21 +1458,23 @@ def test_precision_recall_f1_score_multilabel_1(): y_pred_bi = lb.transform(y_pred_ll) for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - #tp = [0, 1, 1, 0] - #fn = [1, 0, 0, 1] - #fp = [1, 1, 0, 0] + with warnings.catch_warnings(True): - # Check per class - assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) - assert_array_almost_equal(s, [1, 1, 1, 1], 2) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + #tp = [0, 1, 1, 0] + #fn = [1, 0, 0, 1] + #fp = [1, 1, 0, 0] + # Check per class - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) + assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 1, 1, 1], 2) + + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) # Check macro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1520,6 +1522,7 @@ def test_precision_recall_f1_score_multilabel_1(): average="samples"), 0.5) + def test_precision_recall_f1_score_multilabel_2(): """ Test precision_recall_f1_score on a crafted multilabel example 2 """ @@ -1536,17 +1539,17 @@ def test_precision_recall_f1_score_multilabel_2(): # fp = [ 1. 0. 0. 2.] # fn = [ 1. 1. 1. 0.] - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) - assert_array_almost_equal(s, [1, 2, 1, 0], 2) - - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) + with warnings.catch_warnings(True): + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") @@ -1606,17 +1609,17 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): # true_pos = [ 0. 1. 1. 0.] # false_pos = [ 0. 0. 0. 1.] # false_neg = [ 1. 1. 0. 0.] 
- - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) - assert_array_almost_equal(s, [1, 2, 1, 0], 2) - - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) + with warnings.catch_warnings(True): + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) + + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") @@ -1666,63 +1669,64 @@ def test_precision_recall_f1_no_labels(): y_true = np.zeros((20, 3)) y_pred = np.zeros_like(y_true) - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - #tp = [0, 0, 0] - #fn = [0, 0, 0] - #fp = [0, 0, 0] - #support = [0, 0, 0] - - # Check per class - assert_array_almost_equal(p, [0, 0, 0], 2) - assert_array_almost_equal(r, [0, 0, 0], 2) - assert_array_almost_equal(f, [0, 0, 0], 2) - assert_array_almost_equal(s, [0, 0, 0], 2) - assert_array_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average=None), [0, 0, 0], 2) - - # Check macro - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="macro") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="macro"), 0) - - # Check micro - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="micro") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="micro"), 0) - - # Check weighted - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="weighted") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="weighted"), 0) - - # # Check example - # |h(x_i) inter y_i | = [0, 0, 0] - # |y_i| = [0, 0, 0] - # |h(x_i)| = [1, 1, 2] - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="samples") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="samples"), 0) + with warnings.catch_warnings(True): + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + #tp = [0, 0, 0] + #fn = [0, 0, 0] + #fp = [0, 0, 0] + #support = [0, 0, 0] + + # Check per class + assert_array_almost_equal(p, [0, 0, 0], 2) + assert_array_almost_equal(r, [0, 0, 0], 2) + assert_array_almost_equal(f, [0, 0, 0], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) + assert_array_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average=None), [0, 0, 0], 2) + + # Check macro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="macro") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + 
average="macro"), 0) + + # Check micro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="micro") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="micro"), 0) + + # Check weighted + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="weighted") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="weighted"), 0) + + # # Check example + # |h(x_i) inter y_i | = [0, 0, 0] + # |y_i| = [0, 0, 0] + # |h(x_i)| = [1, 1, 2] + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="samples") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), 0) def test__check_clf_targets(): From cc6963b6adcdecaaf3598d7d02def94658b00a22 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 28 May 2013 13:48:57 +0200 Subject: [PATCH 05/14] Backport assert_warns and assert_no_warnings from np 1.7 --- sklearn/utils/testing.py | 36 +++++++++++++++++++++++ sklearn/utils/tests/test_testing.py | 45 +++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 43edb39e52460..eb2f742af14bf 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -9,6 +9,7 @@ # License: BSD 3 clause import inspect import pkgutil +import warnings import scipy as sp from functools import wraps @@ -76,6 +77,41 @@ def _assert_greater(a, b, msg=None): assert a > b, message +# To remove when we support numpy 1.7 +def assert_warns(warning_class, func, *args, **kw): + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + + # Trigger a warning. + result = func(*args, **kw) + + # Verify some things + if not len(w) > 0: + raise AssertionError("No warning raised when calling %s" + % func.__name__) + + if not w[0].category is warning_class: + raise AssertionError("First warning for %s is not a " + "%s( is %s)" + % (func.__name__, warning_class, w[0])) + + return result + + +# To remove when we support numpy 1.7 +def assert_no_warnings(func, *args, **kw): + # XXX: once we may depend on python >= 2.6, this can be replaced by the + # warnings module context manager. 
+ with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + + result = func(*args, **kw) + if len(w) > 0: + raise AssertionError("Got warnings when calling %s: %s" + % (func.__name__, w)) + return result + try: from nose.tools import assert_less except ImportError: diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index 40e33b1a04158..47ff9aa66532a 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -1,8 +1,14 @@ +import warnings +import unittest +import sys + from nose.tools import assert_raises from sklearn.utils.testing import ( _assert_less, _assert_greater, + assert_warns, + assert_no_warnings, assert_equal, set_random_state, assert_raise_message) @@ -62,3 +68,42 @@ def _raise_ValueError(message): assert_raises(ValueError, assert_raise_message, TypeError, "something else", _raise_ValueError, "test") + + + +# This class is taken from numpy 1.7 +class TestWarns(unittest.TestCase): + def test_warn(self): + def f(): + warnings.warn("yo") + return 3 + + before_filters = sys.modules['warnings'].filters[:] + assert_equal(assert_warns(UserWarning, f), 3) + after_filters = sys.modules['warnings'].filters + + assert_raises(AssertionError, assert_no_warnings, f) + assert_equal(assert_no_warnings(lambda x: x, 1), 1) + + # Check that the warnings state is unchanged + assert_equal(before_filters, after_filters, + "assert_warns does not preserver warnings state") + + def test_warn_wrong_warning(self): + def f(): + warnings.warn("yo", DeprecationWarning) + + failed = False + filters = sys.modules['warnings'].filters[:] + try: + try: + # Should raise an AssertionError + assert_warns(UserWarning, f) + failed = True + except AssertionError: + pass + finally: + sys.modules['warnings'].filters = filters + + if failed: + raise AssertionError("wrong warning caught by assert_warn") From 6a4a3629100546ecc95a4a1d16fe3f0cecff873e Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 28 May 2013 15:25:16 +0200 Subject: [PATCH 06/14] TST test warning + ENH Add warning average=samples --- sklearn/metrics/metrics.py | 50 +++++-- sklearn/metrics/tests/test_metrics.py | 191 ++++++++++++-------------- 2 files changed, 122 insertions(+), 119 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index a2f4984dbe16a..09ef9a1f55012 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1448,18 +1448,36 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, size_true[i] = len(true_set) else: raise ValueError("Example-based precision, recall, fscore is " - "not meaning full outside multilabel" + "not meaningful outside of multilabel" "classification. Use accuracy_score instead.") + warning_msg = "" + if np.any(size_pred == 0): + warning_msg += ("Sample-based precision is undefined for some " + "samples. ") + + if np.any(size_true == 0): + warning_msg += ("Sample-based recall is undefined for some " + "samples. ") + + if np.any((beta2 * size_true + size_pred) == 0): + warning_msg += ("Sample-based f_score is undefined for some " + "samples. 
") + + if warning_msg: + warnings.warn(warning_msg) + + try: # oddly, we may get an "invalid" rather than a "divide" error # here old_err_settings = np.seterr(divide='ignore', invalid='ignore') - precision = size_inter / size_pred - recall = size_inter / size_true - f_score = ((1 + beta2) * size_inter / - (beta2 * size_true + size_pred)) + precision = divide(size_inter, size_pred, dtype=np.double) + recall = divide(size_inter, size_true, dtype=np.double) + f_score = divide((1 + beta2) * size_inter, + (beta2 * size_true + size_pred), + dtype=np.double) finally: np.seterr(**old_err_settings) @@ -1467,6 +1485,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, recall[size_true == 0] = 0.0 f_score[(beta2 * size_true + size_pred) == 0] = 0.0 + + precision = np.mean(precision) recall = np.mean(recall) f_score = np.mean(f_score) @@ -1501,7 +1521,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, finally: np.seterr(**old_err_settings) - if not average: + if average in (None, "macro", "weighted"): warning_msg = "" if np.any(idx_ill_defined_precision): warning_msg += ("The sum of true positives and false positives " @@ -1524,6 +1544,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, if warning_msg: warnings.warn(warning_msg) + if not average: return precision, recall, fscore, support elif y_type == 'binary' and pos_label is not None: @@ -1542,12 +1563,11 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, try: # oddly, we may get an "invalid" rather than a "divide" error here old_err_settings = np.seterr(divide='ignore', invalid='ignore') - avg_precision = divide(true_pos.sum(), - true_pos.sum() + false_pos.sum(), - dtype=np.double) - avg_recall = divide(true_pos.sum(), - true_pos.sum() + false_neg.sum(), - dtype=np.double) + tp_sum = true_pos.sum() + fp_sum = false_pos.sum() + fn_sum = false_neg.sum() + avg_precision = divide(tp_sum, tp_sum + fp_sum, dtype=np.double) + avg_recall = divide(tp_sum, tp_sum + fn_sum, dtype=np.double) avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), beta2 * avg_precision + avg_recall, dtype=np.double) @@ -1556,19 +1576,19 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, warning_msg = "" - if np.isnan(avg_precision): + if tp_sum + fp_sum == 0: avg_precision = 0. warning_msg += ("The sum of true positives and false " "positives are equal to zero. Micro-precision" " is ill defined. ") - if np.isnan(avg_recall): + if tp_sum + fn_sum == 0: avg_recall = 0. warning_msg += ("The sum of true positives and false " "negatives are equal to zero. Micro-recall " "is ill defined. ") - if np.isnan(avg_fscore): + if beta2 * avg_precision + avg_recall == 0: avg_fscore = 0. warning_msg += ("Micro-precision and micro-recall are equal " "to zero. 
Micro-fbeta_score is ill defined.") diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index de3734b7cb4de..e499780b13eb8 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -21,6 +21,7 @@ assert_not_equal, assert_array_equal, assert_array_almost_equal, + assert_warns, assert_greater) @@ -55,6 +56,7 @@ from sklearn.externals.six.moves import xrange + REGRESSION_METRICS = { "mean_absolute_error": mean_absolute_error, "mean_squared_error": mean_squared_error, @@ -1221,45 +1223,47 @@ def test_multilabel_representation_invariance(): y2_shuffle_binary_indicator = lb.transform(y2_shuffle) for name, metric in MULTILABELS_METRICS.items(): - measure = metric(y1, y2) - - # Check representation invariance - assert_almost_equal(metric(y1_binary_indicator, y2_binary_indicator), - measure, - err_msg="%s failed representation invariance " - "between list of list of labels format " - "and dense binary indicator format." - % name) - - # Check invariance with redundant labels with list of labels - assert_almost_equal(metric(y1, y2_redundant), measure, - err_msg="%s failed rendundant label invariance" - % name) - - assert_almost_equal(metric(y1_redundant, y2_redundant), measure, - err_msg="%s failed rendundant label invariance" - % name) - - assert_almost_equal(metric(y1_redundant, y2), measure, - err_msg="%s failed rendundant label invariance" - % name) - - # Check shuffling invariance with list of labels - assert_almost_equal(metric(y1_shuffle, y2_shuffle), measure, - err_msg="%s failed shuffling invariance " - "with list of list of labels format." - % name) - - # Check shuffling invariance with dense binary indicator matrix - assert_almost_equal(metric(y1_shuffle_binary_indicator, - y2_shuffle_binary_indicator), measure, - err_msg="%s failed shuffling invariance " - " with dense binary indicator format." - % name) - - # Check raises error with mix input representation - assert_raises(ValueError, metric, y1, y2_binary_indicator) - assert_raises(ValueError, metric, y1_binary_indicator, y2) + with warnings.catch_warnings(True): + measure = metric(y1, y2) + + # Check representation invariance + assert_almost_equal(metric(y1_binary_indicator, + y2_binary_indicator), + measure, + err_msg="%s failed representation invariance " + "between list of list of labels " + "format and dense binary indicator " + "format." % name) + + # Check invariance with redundant labels with list of labels + assert_almost_equal(metric(y1, y2_redundant), measure, + err_msg="%s failed rendundant label invariance" + % name) + + assert_almost_equal(metric(y1_redundant, y2_redundant), measure, + err_msg="%s failed rendundant label invariance" + % name) + + assert_almost_equal(metric(y1_redundant, y2), measure, + err_msg="%s failed rendundant label invariance" + % name) + + # Check shuffling invariance with list of labels + assert_almost_equal(metric(y1_shuffle, y2_shuffle), measure, + err_msg="%s failed shuffling invariance " + "with list of list of labels format." + % name) + + # Check shuffling invariance with dense binary indicator matrix + assert_almost_equal(metric(y1_shuffle_binary_indicator, + y2_shuffle_binary_indicator), measure, + err_msg="%s failed shuffling invariance " + " with dense binary indicator format." 
+ % name) + + # Check raises error with mix input representation + assert_raises(ValueError, metric, y1, y2_binary_indicator) + assert_raises(ValueError, metric, y1_binary_indicator, y2) def test_multilabel_zero_one_loss_subset(): @@ -1587,6 +1591,7 @@ def test_precision_recall_f1_score_multilabel_2(): # |h(x_i) inter y_i | = [0, 0, 1] # |y_i| = [1, 1, 2] # |h(x_i)| = [1, 1, 2] + assert_almost_equal(p, 1 / 6) assert_almost_equal(r, 1 / 6) assert_almost_equal(f, 2 / 4 * 1 / 3) @@ -1651,82 +1656,60 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): average="weighted"), np.average(f2, weights=support)) - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="samples") - # |h(x_i) inter y_i | = [0, 0, 2] - # |y_i| = [1, 1, 2] - # |h(x_i)| = [0, 1, 2] - assert_almost_equal(p, 1 / 3) - assert_almost_equal(r, 1 / 3) - assert_almost_equal(f, 1 / 3) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="samples"), - 0.333, 2) + with warnings.catch_warnings(True): + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="samples") + # |h(x_i) inter y_i | = [0, 0, 2] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [0, 1, 2] + assert_almost_equal(p, 1 / 3) + assert_almost_equal(r, 1 / 3) + assert_almost_equal(f, 1 / 3) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), + 0.333, 2) def test_precision_recall_f1_no_labels(): y_true = np.zeros((20, 3)) y_pred = np.zeros_like(y_true) - with warnings.catch_warnings(True): - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - #tp = [0, 0, 0] - #fn = [0, 0, 0] - #fp = [0, 0, 0] - #support = [0, 0, 0] - - # Check per class + # tp = [0, 0, 0] + # fn = [0, 0, 0] + # fp = [0, 0, 0] + # support = [0, 0, 0] + # |y_hat_i inter y_i | = [0, 0, 0] + # |y_i| = [0, 0, 0] + # |y_hat_i| = [1, 1, 2] + warnings.simplefilter("always") + + for beta in [1]: + p, r, f, s = assert_warns(UserWarning, + precision_recall_fscore_support, + y_true, y_pred, average=None, beta=beta) assert_array_almost_equal(p, [0, 0, 0], 2) assert_array_almost_equal(r, [0, 0, 0], 2) assert_array_almost_equal(f, [0, 0, 0], 2) assert_array_almost_equal(s, [0, 0, 0], 2) - assert_array_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average=None), [0, 0, 0], 2) - - # Check macro - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="macro") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="macro"), 0) - - # Check micro - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="micro") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="micro"), 0) - - # Check weighted - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="weighted") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="weighted"), 0) - # # Check example - # |h(x_i) inter y_i | = [0, 0, 0] - # |y_i| = [0, 0, 0] - # |h(x_i)| = [1, 1, 2] - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="samples") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, 
y_pred, beta=2, - average="samples"), 0) + fbeta = assert_warns(UserWarning, fbeta_score, y_true, y_pred, + beta=beta, average=None) + assert_array_almost_equal(fbeta, [0, 0, 0], 2) + + for average in ["macro", "micro", "weighted", "samples"]: + p, r, f, s = assert_warns(UserWarning, + precision_recall_fscore_support, + y_true, y_pred, average=average, + beta=beta) + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + + fbeta = assert_warns(UserWarning, fbeta_score, y_true, y_pred, + beta=beta, average=average) + assert_almost_equal(fbeta, 0) def test__check_clf_targets(): From 645bacc8623e908b666d68a97d3ce27c004c009f Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 8 Jul 2013 14:27:35 +0200 Subject: [PATCH 07/14] FIX TST with warnings thx to @jnothman --- sklearn/metrics/tests/test_metrics.py | 51 ++++++++++++++------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index e499780b13eb8..ea3d2c64c43a2 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1224,6 +1224,8 @@ def test_multilabel_representation_invariance(): for name, metric in MULTILABELS_METRICS.items(): with warnings.catch_warnings(True): + warnings.simplefilter("always") + measure = metric(y1, y2) # Check representation invariance @@ -1682,34 +1684,35 @@ def test_precision_recall_f1_no_labels(): # |y_hat_i inter y_i | = [0, 0, 0] # |y_i| = [0, 0, 0] # |y_hat_i| = [1, 1, 2] - warnings.simplefilter("always") - - for beta in [1]: - p, r, f, s = assert_warns(UserWarning, - precision_recall_fscore_support, - y_true, y_pred, average=None, beta=beta) - assert_array_almost_equal(p, [0, 0, 0], 2) - assert_array_almost_equal(r, [0, 0, 0], 2) - assert_array_almost_equal(f, [0, 0, 0], 2) - assert_array_almost_equal(s, [0, 0, 0], 2) - - fbeta = assert_warns(UserWarning, fbeta_score, y_true, y_pred, - beta=beta, average=None) - assert_array_almost_equal(fbeta, [0, 0, 0], 2) - - for average in ["macro", "micro", "weighted", "samples"]: + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + + for beta in [1]: p, r, f, s = assert_warns(UserWarning, precision_recall_fscore_support, - y_true, y_pred, average=average, - beta=beta) - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) + y_true, y_pred, average=None, beta=beta) + assert_array_almost_equal(p, [0, 0, 0], 2) + assert_array_almost_equal(r, [0, 0, 0], 2) + assert_array_almost_equal(f, [0, 0, 0], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) fbeta = assert_warns(UserWarning, fbeta_score, y_true, y_pred, - beta=beta, average=average) - assert_almost_equal(fbeta, 0) + beta=beta, average=None) + assert_array_almost_equal(fbeta, [0, 0, 0], 2) + + for average in ["macro", "micro", "weighted", "samples"]: + p, r, f, s = assert_warns(UserWarning, + precision_recall_fscore_support, + y_true, y_pred, average=average, + beta=beta) + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + + fbeta = assert_warns(UserWarning, fbeta_score, y_true, y_pred, + beta=beta, average=average) + assert_almost_equal(fbeta, 0) def test__check_clf_targets(): From 377a963c14e34357568179751a17dfd3356b4816 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 8 Jul 2013 14:31:55 +0200 Subject: [PATCH 08/14] flake8 --- sklearn/metrics/metrics.py | 9 ++++----- 1 file changed, 4 
insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 09ef9a1f55012..2b14951d5735b 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1467,7 +1467,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, if warning_msg: warnings.warn(warning_msg) - try: # oddly, we may get an "invalid" rather than a "divide" error # here @@ -1485,8 +1484,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, recall[size_true == 0] = 0.0 f_score[(beta2 * size_true + size_pred) == 0] = 0.0 - - precision = np.mean(precision) recall = np.mean(recall) f_score = np.mean(f_score) @@ -1561,12 +1558,14 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average == 'micro': try: - # oddly, we may get an "invalid" rather than a "divide" error here + # oddly, we may get an "invalid" rather than a "divide" error + # here old_err_settings = np.seterr(divide='ignore', invalid='ignore') tp_sum = true_pos.sum() fp_sum = false_pos.sum() fn_sum = false_neg.sum() - avg_precision = divide(tp_sum, tp_sum + fp_sum, dtype=np.double) + avg_precision = divide(tp_sum, tp_sum + fp_sum, + dtype=np.double) avg_recall = divide(tp_sum, tp_sum + fn_sum, dtype=np.double) avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), beta2 * avg_precision + avg_recall, From 06c2c7b1b6e46b308495a970860efc240c131a4b Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 10 Jul 2013 16:03:29 +0200 Subject: [PATCH 09/14] ENH set warning to stacklevel 2 --- sklearn/metrics/metrics.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 2b14951d5735b..030446c72032c 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1539,7 +1539,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, % labels[idx_ill_defined_fbeta_score]) if warning_msg: - warnings.warn(warning_msg) + warnings.warn(warning_msg, stacklevel=2) if not average: return precision, recall, fscore, support @@ -1593,7 +1593,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, "to zero. Micro-fbeta_score is ill defined.") if warning_msg: - warnings.warn(warning_msg) + warnings.warn(warning_msg, stacklevel=2) elif average == 'macro': avg_precision = np.mean(precision) @@ -1607,7 +1607,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, avg_fscore = 0. warnings.warn("There isn't any labels in y_true. 
" "Weighted-precision, weighted-recall and " - "weighted-fbeta_score are ill defined.") + "weighted-fbeta_score are ill defined.", + stacklevel=2) else: avg_precision = np.average(precision, weights=support) From 6dafe5752147633e863883dfcbe2be5431faa46c Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 10 Jul 2013 16:04:18 +0200 Subject: [PATCH 10/14] TST silence warning --- sklearn/metrics/tests/test_metrics.py | 98 ++++++++++++++------------- 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index ea3d2c64c43a2..33c5d9d423f11 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -514,6 +514,8 @@ def test_precision_recall_f_binary_single_class(): negative class Such a case may occur with non-stratified cross-validation""" + warnings.simplefilter("ignore") + assert_equal(1., precision_score([1, 1], [1, 1])) assert_equal(1., recall_score([1, 1], [1, 1])) assert_equal(1., f1_score([1, 1], [1, 1])) @@ -1463,24 +1465,25 @@ def test_precision_recall_f1_score_multilabel_1(): y_true_bi = lb.transform(y_true_ll) y_pred_bi = lb.transform(y_pred_ll) + warnings.simplefilter("ignore") + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: - with warnings.catch_warnings(True): - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - #tp = [0, 1, 1, 0] - #fn = [1, 0, 0, 1] - #fp = [1, 1, 0, 0] - # Check per class + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + #tp = [0, 1, 1, 0] + #fn = [1, 0, 0, 1] + #fp = [1, 1, 0, 0] + # Check per class - assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) - assert_array_almost_equal(s, [1, 1, 1, 1], 2) + assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 1, 1, 1], 2) - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) # Check macro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1540,22 +1543,23 @@ def test_precision_recall_f1_score_multilabel_2(): y_true_bi = lb.transform(y_true_ll) y_pred_bi = lb.transform(y_pred_ll) + warnings.simplefilter("ignore") + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: # tp = [ 0. 1. 0. 0.] # fp = [ 1. 0. 0. 2.] # fn = [ 1. 1. 1. 0.] 
- with warnings.catch_warnings(True): - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) - assert_array_almost_equal(s, [1, 2, 1, 0], 2) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") @@ -1612,21 +1616,22 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): y_true_bi = lb.transform(y_true_ll) y_pred_bi = lb.transform(y_pred_ll) + warnings.simplefilter("ignore") + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: # true_pos = [ 0. 1. 1. 0.] # false_pos = [ 0. 0. 0. 1.] # false_neg = [ 1. 1. 0. 0.] - with warnings.catch_warnings(True): - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) - assert_array_almost_equal(s, [1, 2, 1, 0], 2) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") @@ -1658,19 +1663,18 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): average="weighted"), np.average(f2, weights=support)) - with warnings.catch_warnings(True): - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="samples") - # |h(x_i) inter y_i | = [0, 0, 2] - # |y_i| = [1, 1, 2] - # |h(x_i)| = [0, 1, 2] - assert_almost_equal(p, 1 / 3) - assert_almost_equal(r, 1 / 3) - assert_almost_equal(f, 1 / 3) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="samples"), - 0.333, 2) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="samples") + # |h(x_i) inter y_i | = [0, 0, 2] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [0, 1, 2] + assert_almost_equal(p, 1 / 3) + assert_almost_equal(r, 1 / 3) + assert_almost_equal(f, 1 / 3) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), + 0.333, 2) def test_precision_recall_f1_no_labels(): From 79a0cc9368f9802c0cb878b4ed63405940f5e6d1 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 10 Jul 2013 16:09:11 +0200 Subject: [PATCH 11/14] ENH use with np.errstate --- sklearn/metrics/metrics.py | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git 
a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 030446c72032c..667679918bf82 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -910,11 +910,9 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True): # Compute accuracy for each possible representation y_type, y_true, y_pred = _check_clf_targets(y_true, y_pred) if y_type == 'multilabel-indicator': - try: + with np.errstate(divide='ignore', invalid='ignore'): # oddly, we may get an "invalid" rather than a "divide" # error here - old_err_settings = np.seterr(divide='ignore', - invalid='ignore') y_pred_pos_label = y_pred == 1 y_true_pos_label = y_true == 1 pred_inter_true = np.sum(np.logical_and(y_pred_pos_label, @@ -929,8 +927,6 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True): # the jaccard to 1: lim_{x->0} x/x = 1 # Note with py2.6 and np 1.3: we can't check safely for nan. score[pred_union_true == 0.0] = 1.0 - finally: - np.seterr(**old_err_settings) elif y_type == 'multilabel-sequences': score = np.empty(len(y_true), dtype=np.float) @@ -1467,18 +1463,14 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, if warning_msg: warnings.warn(warning_msg) - try: + with np.errstate(divide="ignore", invalid="ignore"): # oddly, we may get an "invalid" rather than a "divide" error # here - old_err_settings = np.seterr(divide='ignore', invalid='ignore') - precision = divide(size_inter, size_pred, dtype=np.double) recall = divide(size_inter, size_true, dtype=np.double) f_score = divide((1 + beta2) * size_inter, (beta2 * size_true + size_pred), dtype=np.double) - finally: - np.seterr(**old_err_settings) precision[size_pred == 0] = 0.0 recall[size_true == 0] = 0.0 @@ -1493,9 +1485,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, true_pos, _, false_pos, false_neg = _tp_tn_fp_fn(y_true, y_pred, labels) support = true_pos + false_neg - try: + with np.errstate(divide='ignore', invalid='ignore'): # oddly, we may get an "invalid" rather than a "divide" error here - old_err_settings = np.seterr(divide='ignore', invalid='ignore') # precision and recall precision = divide(true_pos.astype(np.float), true_pos + false_pos) @@ -1515,8 +1506,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, # handle division by 0 in fscore idx_ill_defined_fbeta_score = (beta2 * precision + recall) == 0 fscore[idx_ill_defined_fbeta_score] = 0.0 - finally: - np.seterr(**old_err_settings) if average in (None, "macro", "weighted"): warning_msg = "" @@ -1557,10 +1546,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, else: average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average == 'micro': - try: + with np.errstate(divide='ignore', invalid='ignore'): # oddly, we may get an "invalid" rather than a "divide" error # here - old_err_settings = np.seterr(divide='ignore', invalid='ignore') + tp_sum = true_pos.sum() fp_sum = false_pos.sum() fn_sum = false_neg.sum() @@ -1570,11 +1559,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), beta2 * avg_precision + avg_recall, dtype=np.double) - finally: - np.seterr(**old_err_settings) warning_msg = "" - if tp_sum + fp_sum == 0: avg_precision = 0. 
warning_msg += ("The sum of true positives and false " From aa0c47e8b1696fbe48a74cc2af92eecade4458a4 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 10 Jul 2013 16:41:53 +0200 Subject: [PATCH 12/14] DOC TST correct comment --- sklearn/metrics/tests/test_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 33c5d9d423f11..1e00feddb6512 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1687,7 +1687,7 @@ def test_precision_recall_f1_no_labels(): # support = [0, 0, 0] # |y_hat_i inter y_i | = [0, 0, 0] # |y_i| = [0, 0, 0] - # |y_hat_i| = [1, 1, 2] + # |y_hat_i| = [0, 0, 0] with warnings.catch_warnings(record=True): warnings.simplefilter("always") From a0aa7776e7fbbfb737d500b4accd93cc7799b311 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 11:26:15 +0200 Subject: [PATCH 13/14] FIX warning test --- sklearn/svm/tests/test_sparse.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/sklearn/svm/tests/test_sparse.py b/sklearn/svm/tests/test_sparse.py index 82ac3308c730c..72fd0605d02bf 100644 --- a/sklearn/svm/tests/test_sparse.py +++ b/sklearn/svm/tests/test_sparse.py @@ -3,7 +3,7 @@ from scipy import sparse from sklearn import datasets, svm, linear_model, base from numpy.testing import (assert_array_almost_equal, assert_array_equal, - assert_equal) + assert_equal, assert_warns) from nose.tools import assert_raises, assert_true, assert_false from nose.tools import assert_equal as nose_assert_equal @@ -275,13 +275,7 @@ def test_sparse_svc_clone_with_callable_kernel(): def test_timeout(): sp = svm.SVC(C=1, kernel=lambda x, y: x * y.T, probability=True, max_iter=1) - with warnings.catch_warnings(record=True) as foo: - sp.fit(X_sp, Y) - nose_assert_equal(len(foo), 1, msg=foo) - nose_assert_equal(foo[0].category, ConvergenceWarning, - msg=foo[0].category) + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") - -if __name__ == '__main__': - import nose - nose.runmodule() + assert_warns(ConvergenceWarning, sp.fit, X_sp, Y) From a5a026ca3140a2f14c007510ac99c4ef12947472 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 11:41:20 +0200 Subject: [PATCH 14/14] FIX warning tests in preprocessing --- sklearn/preprocessing/tests/test_data.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 37b81c3d03d71..b4bdb0f7d9076 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -10,6 +10,7 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false +from sklearn.utils.testing import assert_warns from sklearn.utils.sparsefuncs import mean_variance_axis0 from sklearn.preprocessing.data import _transform_selected @@ -306,13 +307,13 @@ def test_warning_scaling_integers(): X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8) - with warnings.catch_warnings(record=True) as w: - StandardScaler().fit(X) - assert_equal(len(w), 1) + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + assert_warns(UserWarning, StandardScaler().fit, X) - with warnings.catch_warnings(record=True) as w: - MinMaxScaler().fit(X) - assert_equal(len(w), 1) + with warnings.catch_warnings(record=True): + 
warnings.simplefilter("always") + assert_warns(UserWarning, MinMaxScaler().fit, X) def test_normalizer_l1():