From 068dcc399a9b93b658be7de71eb1c95aae3a71aa Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 22 May 2013 12:05:10 +0200 Subject: [PATCH 01/14] FIX bug in f_score with beta !=1 --- sklearn/metrics/metrics.py | 2 +- sklearn/metrics/tests/test_metrics.py | 65 ++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 5a97f503682e3..3a15ae155c239 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1458,7 +1458,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, precision = size_inter / size_true recall = size_inter / size_pred - f_score = ((1 + beta2 ** 2) * size_inter / + f_score = ((1 + beta2) * size_inter / (beta2 * size_pred + size_true)) finally: np.seterr(**old_err_settings) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 90cdd7f258da4..985f3cfaf6ab4 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -503,6 +503,9 @@ def test_precision_recall_f1_score_binary(): fs = f1_score(y_true, y_pred) assert_array_almost_equal(fs, 0.76, 2) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2), + (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2) + def test_precision_recall_f_binary_single_class(): """Test precision, recall and F1 score behave with a single positive or @@ -1467,6 +1470,10 @@ def test_precision_recall_f1_score_multilabel_1(): assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) assert_array_almost_equal(s, [1, 1, 1, 1], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) + # Check macro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") @@ -1474,6 +1481,9 @@ def test_precision_recall_f1_score_multilabel_1(): assert_almost_equal(r, 0.5) assert_almost_equal(f, 2.5 / 1.5 * 0.25) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="macro"), + np.mean(f2)) # Check micro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1482,6 +1492,9 @@ def test_precision_recall_f1_score_multilabel_1(): assert_almost_equal(r, 0.5) assert_almost_equal(f, 0.5) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="micro"), + (1 + 4) * p * r / (4 * p + r)) # Check weigted p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1490,7 +1503,9 @@ def test_precision_recall_f1_score_multilabel_1(): assert_almost_equal(r, 0.5) assert_almost_equal(f, 2.5 / 1.5 * 0.25) assert_equal(s, None) - + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="weighted"), + np.average(f2, weights=support)) # Check weigted # |h(x_i) inter y_i | = [0, 1, 1] # |y_i| = [1, 1, 2] @@ -1501,7 +1516,9 @@ def test_precision_recall_f1_score_multilabel_1(): assert_almost_equal(r, 0.5) assert_almost_equal(f, 0.5) assert_equal(s, None) - + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), + 0.5) def test_precision_recall_f1_score_multilabel_2(): """ Test precision_recall_f1_score on a crafted multilabel example 2 @@ -1526,12 +1543,20 @@ def test_precision_recall_f1_score_multilabel_2(): assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) assert_array_almost_equal(s, [1, 2, 1, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) + + p, r, f, s = 
precision_recall_fscore_support(y_true, y_pred, average="micro") assert_almost_equal(p, 0.25) assert_almost_equal(r, 0.25) assert_almost_equal(f, 2 * 0.25 * 0.25 / 0.5) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="micro"), + (1 + 4) * p * r / (4 * p + r)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") @@ -1539,6 +1564,9 @@ def test_precision_recall_f1_score_multilabel_2(): assert_almost_equal(r, 0.125) assert_almost_equal(f, 2 / 12) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="macro"), + np.mean(f2)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") @@ -1546,6 +1574,9 @@ def test_precision_recall_f1_score_multilabel_2(): assert_almost_equal(r, 1 / 4) assert_almost_equal(f, 2 / 3 * 2 / 4) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="weighted"), + np.average(f2, weights=support)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") @@ -1557,6 +1588,9 @@ def test_precision_recall_f1_score_multilabel_2(): assert_almost_equal(r, 1 / 6) assert_almost_equal(f, 2 / 4 * 1 / 3) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), + 0.1666, 2) def test_precision_recall_f1_score_with_an_empty_prediction(): @@ -1580,12 +1614,19 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) assert_array_almost_equal(s, [1, 2, 1, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") assert_almost_equal(p, 0.5) assert_almost_equal(r, 1.5 / 4) assert_almost_equal(f, 2.5 / (4 * 1.5)) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="macro"), + np.mean(f2)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") @@ -1593,6 +1634,9 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): assert_almost_equal(r, 0.5) assert_almost_equal(f, 2 / 3 / (2 / 3 + 0.5)) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="micro"), + (1 + 4) * p * r / (4 * p + r)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") @@ -1600,6 +1644,9 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): assert_almost_equal(r, 0.5) assert_almost_equal(f, (2 / 1.5 + 1) / 4) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="weighted"), + np.average(f2, weights=support)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") @@ -1610,6 +1657,9 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): assert_almost_equal(r, 2 / 3) assert_almost_equal(f, 1 / 3) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), + 0.333, 2) def test_precision_recall_f1_no_labels(): @@ -1628,6 +1678,8 @@ def test_precision_recall_f1_no_labels(): assert_array_almost_equal(r, [0, 0, 0], 2) assert_array_almost_equal(f, [0, 0, 0], 2) assert_array_almost_equal(s, [0, 0, 0], 2) + assert_array_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average=None), [0, 0, 0], 2) # Check macro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1636,6 +1688,8 @@ def test_precision_recall_f1_no_labels(): 
assert_almost_equal(r, 0) assert_almost_equal(f, 0) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="macro"), 0) # Check micro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1644,6 +1698,9 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(r, 0) assert_almost_equal(f, 0) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="micro"), 0) + # Check weighted p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1652,6 +1709,8 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(r, 0) assert_almost_equal(f, 0) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="weighted"), 0) # # Check example # |h(x_i) inter y_i | = [0, 0, 0] @@ -1663,6 +1722,8 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(r, 1) assert_almost_equal(f, 1) assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), 1) def test__check_clf_targets(): From 1868790207f974e5874b9b6d1eeb581584336b0d Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 22 May 2013 15:47:39 +0200 Subject: [PATCH 02/14] FIX formula inversion for sample-based precision/recall --- sklearn/metrics/metrics.py | 18 ++++++++++-------- sklearn/metrics/tests/test_metrics.py | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 3a15ae155c239..a533e54271588 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1415,6 +1415,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, ... # doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) + """ if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") @@ -1448,24 +1449,24 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, size_true[i] = len(true_set) else: raise ValueError("Example-based precision, recall, fscore is " - "not meaning full outside multilabe" - "classification. See the accuracy_score instead.") + "not meaning full outside multilabel" + "classification. 
Use accuracy_score instead.") try: # oddly, we may get an "invalid" rather than a "divide" error # here old_err_settings = np.seterr(divide='ignore', invalid='ignore') - precision = size_inter / size_true - recall = size_inter / size_pred + precision = size_inter / size_pred + recall = size_inter / size_true f_score = ((1 + beta2) * size_inter / - (beta2 * size_pred + size_true)) + (beta2 * size_true + size_pred)) finally: np.seterr(**old_err_settings) - precision[size_true == 0] = 1.0 - recall[size_pred == 0] = 1.0 - f_score[(beta2 * size_pred + size_true) == 0] = 1.0 + precision[size_pred == 0] = 1.0 + recall[size_true == 0] = 1.0 + f_score[(beta2 * size_true + size_pred) == 0] = 1.0 precision = np.mean(precision) recall = np.mean(recall) @@ -1698,6 +1699,7 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): >>> recall_score(y_true, y_pred, average=None) array([ 1., 0., 0.]) + """ _, r, _, _ = precision_recall_fscore_support(y_true, y_pred, labels=labels, diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 985f3cfaf6ab4..e92b069a30a95 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1653,8 +1653,8 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): # |h(x_i) inter y_i | = [0, 0, 2] # |y_i| = [1, 1, 2] # |h(x_i)| = [0, 1, 2] - assert_almost_equal(p, 1 / 3) - assert_almost_equal(r, 2 / 3) + assert_almost_equal(p, 2 / 3) + assert_almost_equal(r, 1 / 3) assert_almost_equal(f, 1 / 3) assert_equal(s, None) assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, From 42a127084eff73906a89802fe748289e49d995fb Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 23 May 2013 08:56:20 +0200 Subject: [PATCH 03/14] FIX set same default behavior for precision, recall and f-score --- sklearn/metrics/metrics.py | 7 +++---- sklearn/metrics/tests/test_metrics.py | 11 +++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index a533e54271588..f43d5f2e8a2f4 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1415,7 +1415,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, ... 
# doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) - """ if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") @@ -1464,9 +1463,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, finally: np.seterr(**old_err_settings) - precision[size_pred == 0] = 1.0 - recall[size_true == 0] = 1.0 - f_score[(beta2 * size_true + size_pred) == 0] = 1.0 + precision[size_pred == 0] = 0.0 + recall[size_true == 0] = 0.0 + f_score[(beta2 * size_true + size_pred) == 0] = 0.0 precision = np.mean(precision) recall = np.mean(recall) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index e92b069a30a95..094df4651e614 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1653,7 +1653,7 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): # |h(x_i) inter y_i | = [0, 0, 2] # |y_i| = [1, 1, 2] # |h(x_i)| = [0, 1, 2] - assert_almost_equal(p, 2 / 3) + assert_almost_equal(p, 1 / 3) assert_almost_equal(r, 1 / 3) assert_almost_equal(f, 1 / 3) assert_equal(s, None) @@ -1701,7 +1701,6 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, average="micro"), 0) - # Check weighted p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") @@ -1718,12 +1717,12 @@ def test_precision_recall_f1_no_labels(): # |h(x_i)| = [1, 1, 2] p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") - assert_almost_equal(p, 1) - assert_almost_equal(r, 1) - assert_almost_equal(f, 1) + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) assert_equal(s, None) assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="samples"), 1) + average="samples"), 0) def test__check_clf_targets(): From 98a6b70135b532c4ddc37c01732e1a98f991338e Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 23 May 2013 15:14:46 +0200 Subject: [PATCH 04/14] ENH raise warning with ill define precision, recall and fscore --- sklearn/metrics/metrics.py | 72 ++++++++-- sklearn/metrics/tests/test_metrics.py | 186 +++++++++++++------------- 2 files changed, 155 insertions(+), 103 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index f43d5f2e8a2f4..a2f4984dbe16a 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1484,20 +1484,46 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, precision = divide(true_pos.astype(np.float), true_pos + false_pos) recall = divide(true_pos.astype(np.float), true_pos + false_neg) + idx_ill_defined_precision = (true_pos + false_pos) == 0 + idx_ill_defined_recall = (true_pos + false_neg) == 0 + # handle division by 0 in precision and recall - precision[(true_pos + false_pos) == 0] = 0.0 - recall[(true_pos + false_neg) == 0] = 0.0 + precision[idx_ill_defined_precision] = 0.0 + recall[idx_ill_defined_recall] = 0.0 # fbeta score fscore = divide((1 + beta2) * precision * recall, beta2 * precision + recall) # handle division by 0 in fscore - fscore[(beta2 * precision + recall) == 0] = 0.0 + idx_ill_defined_fbeta_score = (beta2 * precision + recall) == 0 + fscore[idx_ill_defined_fbeta_score] = 0.0 finally: np.seterr(**old_err_settings) if not average: + warning_msg = "" + if np.any(idx_ill_defined_precision): + warning_msg += ("The sum of true positives and false positives " + "are equal to zero for some labels. Precision is " + "ill defined for those labels %s. 
" + % labels[idx_ill_defined_precision]) + + if np.any(idx_ill_defined_recall): + warning_msg += ("The sum of true positives and false negatives " + "are equal to zero for some labels. Recall is ill " + "defined for those labels %s. " + % labels[idx_ill_defined_recall]) + + if np.any(idx_ill_defined_fbeta_score): + warning_msg += ("The precision and recall are equal to zero for " + "some labels. fbeta_score is ill defined for " + "those labels %s. " + % labels[idx_ill_defined_fbeta_score]) + + if warning_msg: + warnings.warn(warning_msg) + return precision, recall, fscore, support elif y_type == 'binary' and pos_label is not None: @@ -1513,24 +1539,42 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, else: average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average == 'micro': - avg_precision = divide(true_pos.sum(), - true_pos.sum() + false_pos.sum(), - dtype=np.double) - avg_recall = divide(true_pos.sum(), - true_pos.sum() + false_neg.sum(), - dtype=np.double) - avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), - beta2 * avg_precision + avg_recall, - dtype=np.double) + try: + # oddly, we may get an "invalid" rather than a "divide" error here + old_err_settings = np.seterr(divide='ignore', invalid='ignore') + avg_precision = divide(true_pos.sum(), + true_pos.sum() + false_pos.sum(), + dtype=np.double) + avg_recall = divide(true_pos.sum(), + true_pos.sum() + false_neg.sum(), + dtype=np.double) + avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), + beta2 * avg_precision + avg_recall, + dtype=np.double) + finally: + np.seterr(**old_err_settings) + + warning_msg = "" if np.isnan(avg_precision): avg_precision = 0. + warning_msg += ("The sum of true positives and false " + "positives are equal to zero. Micro-precision" + " is ill defined. ") if np.isnan(avg_recall): avg_recall = 0. + warning_msg += ("The sum of true positives and false " + "negatives are equal to zero. Micro-recall " + "is ill defined. ") if np.isnan(avg_fscore): avg_fscore = 0. + warning_msg += ("Micro-precision and micro-recall are equal " + "to zero. Micro-fbeta_score is ill defined.") + + if warning_msg: + warnings.warn(warning_msg) elif average == 'macro': avg_precision = np.mean(precision) @@ -1542,6 +1586,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, avg_precision = 0. avg_recall = 0. avg_fscore = 0. + warnings.warn("There isn't any labels in y_true. 
" + "Weighted-precision, weighted-recall and " + "weighted-fbeta_score are ill defined.") + else: avg_precision = np.average(precision, weights=support) avg_recall = np.average(recall, weights=support) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 094df4651e614..de3734b7cb4de 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1458,21 +1458,23 @@ def test_precision_recall_f1_score_multilabel_1(): y_pred_bi = lb.transform(y_pred_ll) for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - #tp = [0, 1, 1, 0] - #fn = [1, 0, 0, 1] - #fp = [1, 1, 0, 0] + with warnings.catch_warnings(True): - # Check per class - assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) - assert_array_almost_equal(s, [1, 1, 1, 1], 2) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + #tp = [0, 1, 1, 0] + #fn = [1, 0, 0, 1] + #fp = [1, 1, 0, 0] + # Check per class - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) + assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 1, 1, 1], 2) + + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) # Check macro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1520,6 +1522,7 @@ def test_precision_recall_f1_score_multilabel_1(): average="samples"), 0.5) + def test_precision_recall_f1_score_multilabel_2(): """ Test precision_recall_f1_score on a crafted multilabel example 2 """ @@ -1536,17 +1539,17 @@ def test_precision_recall_f1_score_multilabel_2(): # fp = [ 1. 0. 0. 2.] # fn = [ 1. 1. 1. 0.] - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) - assert_array_almost_equal(s, [1, 2, 1, 0], 2) - - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) + with warnings.catch_warnings(True): + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") @@ -1606,17 +1609,17 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): # true_pos = [ 0. 1. 1. 0.] # false_pos = [ 0. 0. 0. 1.] # false_neg = [ 1. 1. 0. 0.] 
- - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) - assert_array_almost_equal(s, [1, 2, 1, 0], 2) - - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) + with warnings.catch_warnings(True): + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) + + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") @@ -1666,63 +1669,64 @@ def test_precision_recall_f1_no_labels(): y_true = np.zeros((20, 3)) y_pred = np.zeros_like(y_true) - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - #tp = [0, 0, 0] - #fn = [0, 0, 0] - #fp = [0, 0, 0] - #support = [0, 0, 0] - - # Check per class - assert_array_almost_equal(p, [0, 0, 0], 2) - assert_array_almost_equal(r, [0, 0, 0], 2) - assert_array_almost_equal(f, [0, 0, 0], 2) - assert_array_almost_equal(s, [0, 0, 0], 2) - assert_array_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average=None), [0, 0, 0], 2) - - # Check macro - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="macro") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="macro"), 0) - - # Check micro - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="micro") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="micro"), 0) - - # Check weighted - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="weighted") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="weighted"), 0) - - # # Check example - # |h(x_i) inter y_i | = [0, 0, 0] - # |y_i| = [0, 0, 0] - # |h(x_i)| = [1, 1, 2] - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="samples") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="samples"), 0) + with warnings.catch_warnings(True): + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + #tp = [0, 0, 0] + #fn = [0, 0, 0] + #fp = [0, 0, 0] + #support = [0, 0, 0] + + # Check per class + assert_array_almost_equal(p, [0, 0, 0], 2) + assert_array_almost_equal(r, [0, 0, 0], 2) + assert_array_almost_equal(f, [0, 0, 0], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) + assert_array_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average=None), [0, 0, 0], 2) + + # Check macro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="macro") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + 
average="macro"), 0) + + # Check micro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="micro") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="micro"), 0) + + # Check weighted + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="weighted") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="weighted"), 0) + + # # Check example + # |h(x_i) inter y_i | = [0, 0, 0] + # |y_i| = [0, 0, 0] + # |h(x_i)| = [1, 1, 2] + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="samples") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), 0) def test__check_clf_targets(): From cc6963b6adcdecaaf3598d7d02def94658b00a22 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 28 May 2013 13:48:57 +0200 Subject: [PATCH 05/14] Backport assert_warns and assert_no_warnings from np 1.7 --- sklearn/utils/testing.py | 36 +++++++++++++++++++++++ sklearn/utils/tests/test_testing.py | 45 +++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 43edb39e52460..eb2f742af14bf 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -9,6 +9,7 @@ # License: BSD 3 clause import inspect import pkgutil +import warnings import scipy as sp from functools import wraps @@ -76,6 +77,41 @@ def _assert_greater(a, b, msg=None): assert a > b, message +# To remove when we support numpy 1.7 +def assert_warns(warning_class, func, *args, **kw): + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + + # Trigger a warning. + result = func(*args, **kw) + + # Verify some things + if not len(w) > 0: + raise AssertionError("No warning raised when calling %s" + % func.__name__) + + if not w[0].category is warning_class: + raise AssertionError("First warning for %s is not a " + "%s( is %s)" + % (func.__name__, warning_class, w[0])) + + return result + + +# To remove when we support numpy 1.7 +def assert_no_warnings(func, *args, **kw): + # XXX: once we may depend on python >= 2.6, this can be replaced by the + # warnings module context manager. 
+ with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + + result = func(*args, **kw) + if len(w) > 0: + raise AssertionError("Got warnings when calling %s: %s" + % (func.__name__, w)) + return result + try: from nose.tools import assert_less except ImportError: diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index 40e33b1a04158..47ff9aa66532a 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -1,8 +1,14 @@ +import warnings +import unittest +import sys + from nose.tools import assert_raises from sklearn.utils.testing import ( _assert_less, _assert_greater, + assert_warns, + assert_no_warnings, assert_equal, set_random_state, assert_raise_message) @@ -62,3 +68,42 @@ def _raise_ValueError(message): assert_raises(ValueError, assert_raise_message, TypeError, "something else", _raise_ValueError, "test") + + + +# This class is taken from numpy 1.7 +class TestWarns(unittest.TestCase): + def test_warn(self): + def f(): + warnings.warn("yo") + return 3 + + before_filters = sys.modules['warnings'].filters[:] + assert_equal(assert_warns(UserWarning, f), 3) + after_filters = sys.modules['warnings'].filters + + assert_raises(AssertionError, assert_no_warnings, f) + assert_equal(assert_no_warnings(lambda x: x, 1), 1) + + # Check that the warnings state is unchanged + assert_equal(before_filters, after_filters, + "assert_warns does not preserver warnings state") + + def test_warn_wrong_warning(self): + def f(): + warnings.warn("yo", DeprecationWarning) + + failed = False + filters = sys.modules['warnings'].filters[:] + try: + try: + # Should raise an AssertionError + assert_warns(UserWarning, f) + failed = True + except AssertionError: + pass + finally: + sys.modules['warnings'].filters = filters + + if failed: + raise AssertionError("wrong warning caught by assert_warn") From 6a4a3629100546ecc95a4a1d16fe3f0cecff873e Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 28 May 2013 15:25:16 +0200 Subject: [PATCH 06/14] TST test warning + ENH Add warning average=samples --- sklearn/metrics/metrics.py | 50 +++++-- sklearn/metrics/tests/test_metrics.py | 191 ++++++++++++-------------- 2 files changed, 122 insertions(+), 119 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index a2f4984dbe16a..09ef9a1f55012 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1448,18 +1448,36 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, size_true[i] = len(true_set) else: raise ValueError("Example-based precision, recall, fscore is " - "not meaning full outside multilabel" + "not meaningful outside of multilabel" "classification. Use accuracy_score instead.") + warning_msg = "" + if np.any(size_pred == 0): + warning_msg += ("Sample-based precision is undefined for some " + "samples. ") + + if np.any(size_true == 0): + warning_msg += ("Sample-based recall is undefined for some " + "samples. ") + + if np.any((beta2 * size_true + size_pred) == 0): + warning_msg += ("Sample-based f_score is undefined for some " + "samples. 
") + + if warning_msg: + warnings.warn(warning_msg) + + try: # oddly, we may get an "invalid" rather than a "divide" error # here old_err_settings = np.seterr(divide='ignore', invalid='ignore') - precision = size_inter / size_pred - recall = size_inter / size_true - f_score = ((1 + beta2) * size_inter / - (beta2 * size_true + size_pred)) + precision = divide(size_inter, size_pred, dtype=np.double) + recall = divide(size_inter, size_true, dtype=np.double) + f_score = divide((1 + beta2) * size_inter, + (beta2 * size_true + size_pred), + dtype=np.double) finally: np.seterr(**old_err_settings) @@ -1467,6 +1485,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, recall[size_true == 0] = 0.0 f_score[(beta2 * size_true + size_pred) == 0] = 0.0 + + precision = np.mean(precision) recall = np.mean(recall) f_score = np.mean(f_score) @@ -1501,7 +1521,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, finally: np.seterr(**old_err_settings) - if not average: + if average in (None, "macro", "weighted"): warning_msg = "" if np.any(idx_ill_defined_precision): warning_msg += ("The sum of true positives and false positives " @@ -1524,6 +1544,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, if warning_msg: warnings.warn(warning_msg) + if not average: return precision, recall, fscore, support elif y_type == 'binary' and pos_label is not None: @@ -1542,12 +1563,11 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, try: # oddly, we may get an "invalid" rather than a "divide" error here old_err_settings = np.seterr(divide='ignore', invalid='ignore') - avg_precision = divide(true_pos.sum(), - true_pos.sum() + false_pos.sum(), - dtype=np.double) - avg_recall = divide(true_pos.sum(), - true_pos.sum() + false_neg.sum(), - dtype=np.double) + tp_sum = true_pos.sum() + fp_sum = false_pos.sum() + fn_sum = false_neg.sum() + avg_precision = divide(tp_sum, tp_sum + fp_sum, dtype=np.double) + avg_recall = divide(tp_sum, tp_sum + fn_sum, dtype=np.double) avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), beta2 * avg_precision + avg_recall, dtype=np.double) @@ -1556,19 +1576,19 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, warning_msg = "" - if np.isnan(avg_precision): + if tp_sum + fp_sum == 0: avg_precision = 0. warning_msg += ("The sum of true positives and false " "positives are equal to zero. Micro-precision" " is ill defined. ") - if np.isnan(avg_recall): + if tp_sum + fn_sum == 0: avg_recall = 0. warning_msg += ("The sum of true positives and false " "negatives are equal to zero. Micro-recall " "is ill defined. ") - if np.isnan(avg_fscore): + if beta2 * avg_precision + avg_recall == 0: avg_fscore = 0. warning_msg += ("Micro-precision and micro-recall are equal " "to zero. 
Micro-fbeta_score is ill defined.") diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index de3734b7cb4de..e499780b13eb8 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -21,6 +21,7 @@ assert_not_equal, assert_array_equal, assert_array_almost_equal, + assert_warns, assert_greater) @@ -55,6 +56,7 @@ from sklearn.externals.six.moves import xrange + REGRESSION_METRICS = { "mean_absolute_error": mean_absolute_error, "mean_squared_error": mean_squared_error, @@ -1221,45 +1223,47 @@ def test_multilabel_representation_invariance(): y2_shuffle_binary_indicator = lb.transform(y2_shuffle) for name, metric in MULTILABELS_METRICS.items(): - measure = metric(y1, y2) - - # Check representation invariance - assert_almost_equal(metric(y1_binary_indicator, y2_binary_indicator), - measure, - err_msg="%s failed representation invariance " - "between list of list of labels format " - "and dense binary indicator format." - % name) - - # Check invariance with redundant labels with list of labels - assert_almost_equal(metric(y1, y2_redundant), measure, - err_msg="%s failed rendundant label invariance" - % name) - - assert_almost_equal(metric(y1_redundant, y2_redundant), measure, - err_msg="%s failed rendundant label invariance" - % name) - - assert_almost_equal(metric(y1_redundant, y2), measure, - err_msg="%s failed rendundant label invariance" - % name) - - # Check shuffling invariance with list of labels - assert_almost_equal(metric(y1_shuffle, y2_shuffle), measure, - err_msg="%s failed shuffling invariance " - "with list of list of labels format." - % name) - - # Check shuffling invariance with dense binary indicator matrix - assert_almost_equal(metric(y1_shuffle_binary_indicator, - y2_shuffle_binary_indicator), measure, - err_msg="%s failed shuffling invariance " - " with dense binary indicator format." - % name) - - # Check raises error with mix input representation - assert_raises(ValueError, metric, y1, y2_binary_indicator) - assert_raises(ValueError, metric, y1_binary_indicator, y2) + with warnings.catch_warnings(True): + measure = metric(y1, y2) + + # Check representation invariance + assert_almost_equal(metric(y1_binary_indicator, + y2_binary_indicator), + measure, + err_msg="%s failed representation invariance " + "between list of list of labels " + "format and dense binary indicator " + "format." % name) + + # Check invariance with redundant labels with list of labels + assert_almost_equal(metric(y1, y2_redundant), measure, + err_msg="%s failed rendundant label invariance" + % name) + + assert_almost_equal(metric(y1_redundant, y2_redundant), measure, + err_msg="%s failed rendundant label invariance" + % name) + + assert_almost_equal(metric(y1_redundant, y2), measure, + err_msg="%s failed rendundant label invariance" + % name) + + # Check shuffling invariance with list of labels + assert_almost_equal(metric(y1_shuffle, y2_shuffle), measure, + err_msg="%s failed shuffling invariance " + "with list of list of labels format." + % name) + + # Check shuffling invariance with dense binary indicator matrix + assert_almost_equal(metric(y1_shuffle_binary_indicator, + y2_shuffle_binary_indicator), measure, + err_msg="%s failed shuffling invariance " + " with dense binary indicator format." 
+ % name) + + # Check raises error with mix input representation + assert_raises(ValueError, metric, y1, y2_binary_indicator) + assert_raises(ValueError, metric, y1_binary_indicator, y2) def test_multilabel_zero_one_loss_subset(): @@ -1587,6 +1591,7 @@ def test_precision_recall_f1_score_multilabel_2(): # |h(x_i) inter y_i | = [0, 0, 1] # |y_i| = [1, 1, 2] # |h(x_i)| = [1, 1, 2] + assert_almost_equal(p, 1 / 6) assert_almost_equal(r, 1 / 6) assert_almost_equal(f, 2 / 4 * 1 / 3) @@ -1651,82 +1656,60 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): average="weighted"), np.average(f2, weights=support)) - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="samples") - # |h(x_i) inter y_i | = [0, 0, 2] - # |y_i| = [1, 1, 2] - # |h(x_i)| = [0, 1, 2] - assert_almost_equal(p, 1 / 3) - assert_almost_equal(r, 1 / 3) - assert_almost_equal(f, 1 / 3) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="samples"), - 0.333, 2) + with warnings.catch_warnings(True): + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="samples") + # |h(x_i) inter y_i | = [0, 0, 2] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [0, 1, 2] + assert_almost_equal(p, 1 / 3) + assert_almost_equal(r, 1 / 3) + assert_almost_equal(f, 1 / 3) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), + 0.333, 2) def test_precision_recall_f1_no_labels(): y_true = np.zeros((20, 3)) y_pred = np.zeros_like(y_true) - with warnings.catch_warnings(True): - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - #tp = [0, 0, 0] - #fn = [0, 0, 0] - #fp = [0, 0, 0] - #support = [0, 0, 0] - - # Check per class + # tp = [0, 0, 0] + # fn = [0, 0, 0] + # fp = [0, 0, 0] + # support = [0, 0, 0] + # |y_hat_i inter y_i | = [0, 0, 0] + # |y_i| = [0, 0, 0] + # |y_hat_i| = [1, 1, 2] + warnings.simplefilter("always") + + for beta in [1]: + p, r, f, s = assert_warns(UserWarning, + precision_recall_fscore_support, + y_true, y_pred, average=None, beta=beta) assert_array_almost_equal(p, [0, 0, 0], 2) assert_array_almost_equal(r, [0, 0, 0], 2) assert_array_almost_equal(f, [0, 0, 0], 2) assert_array_almost_equal(s, [0, 0, 0], 2) - assert_array_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average=None), [0, 0, 0], 2) - - # Check macro - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="macro") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="macro"), 0) - - # Check micro - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="micro") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="micro"), 0) - - # Check weighted - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="weighted") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="weighted"), 0) - # # Check example - # |h(x_i) inter y_i | = [0, 0, 0] - # |y_i| = [0, 0, 0] - # |h(x_i)| = [1, 1, 2] - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="samples") - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, 
y_pred, beta=2, - average="samples"), 0) + fbeta = assert_warns(UserWarning, fbeta_score, y_true, y_pred, + beta=beta, average=None) + assert_array_almost_equal(fbeta, [0, 0, 0], 2) + + for average in ["macro", "micro", "weighted", "samples"]: + p, r, f, s = assert_warns(UserWarning, + precision_recall_fscore_support, + y_true, y_pred, average=average, + beta=beta) + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + + fbeta = assert_warns(UserWarning, fbeta_score, y_true, y_pred, + beta=beta, average=average) + assert_almost_equal(fbeta, 0) def test__check_clf_targets(): From 645bacc8623e908b666d68a97d3ce27c004c009f Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 8 Jul 2013 14:27:35 +0200 Subject: [PATCH 07/14] FIX TST with warnings thx to @jnothman --- sklearn/metrics/tests/test_metrics.py | 51 ++++++++++++++------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index e499780b13eb8..ea3d2c64c43a2 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1224,6 +1224,8 @@ def test_multilabel_representation_invariance(): for name, metric in MULTILABELS_METRICS.items(): with warnings.catch_warnings(True): + warnings.simplefilter("always") + measure = metric(y1, y2) # Check representation invariance @@ -1682,34 +1684,35 @@ def test_precision_recall_f1_no_labels(): # |y_hat_i inter y_i | = [0, 0, 0] # |y_i| = [0, 0, 0] # |y_hat_i| = [1, 1, 2] - warnings.simplefilter("always") - - for beta in [1]: - p, r, f, s = assert_warns(UserWarning, - precision_recall_fscore_support, - y_true, y_pred, average=None, beta=beta) - assert_array_almost_equal(p, [0, 0, 0], 2) - assert_array_almost_equal(r, [0, 0, 0], 2) - assert_array_almost_equal(f, [0, 0, 0], 2) - assert_array_almost_equal(s, [0, 0, 0], 2) - - fbeta = assert_warns(UserWarning, fbeta_score, y_true, y_pred, - beta=beta, average=None) - assert_array_almost_equal(fbeta, [0, 0, 0], 2) - - for average in ["macro", "micro", "weighted", "samples"]: + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + + for beta in [1]: p, r, f, s = assert_warns(UserWarning, precision_recall_fscore_support, - y_true, y_pred, average=average, - beta=beta) - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) + y_true, y_pred, average=None, beta=beta) + assert_array_almost_equal(p, [0, 0, 0], 2) + assert_array_almost_equal(r, [0, 0, 0], 2) + assert_array_almost_equal(f, [0, 0, 0], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) fbeta = assert_warns(UserWarning, fbeta_score, y_true, y_pred, - beta=beta, average=average) - assert_almost_equal(fbeta, 0) + beta=beta, average=None) + assert_array_almost_equal(fbeta, [0, 0, 0], 2) + + for average in ["macro", "micro", "weighted", "samples"]: + p, r, f, s = assert_warns(UserWarning, + precision_recall_fscore_support, + y_true, y_pred, average=average, + beta=beta) + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + + fbeta = assert_warns(UserWarning, fbeta_score, y_true, y_pred, + beta=beta, average=average) + assert_almost_equal(fbeta, 0) def test__check_clf_targets(): From 377a963c14e34357568179751a17dfd3356b4816 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 8 Jul 2013 14:31:55 +0200 Subject: [PATCH 08/14] flake8 --- sklearn/metrics/metrics.py | 9 ++++----- 1 file changed, 4 
insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 09ef9a1f55012..2b14951d5735b 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1467,7 +1467,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, if warning_msg: warnings.warn(warning_msg) - try: # oddly, we may get an "invalid" rather than a "divide" error # here @@ -1485,8 +1484,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, recall[size_true == 0] = 0.0 f_score[(beta2 * size_true + size_pred) == 0] = 0.0 - - precision = np.mean(precision) recall = np.mean(recall) f_score = np.mean(f_score) @@ -1561,12 +1558,14 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average == 'micro': try: - # oddly, we may get an "invalid" rather than a "divide" error here + # oddly, we may get an "invalid" rather than a "divide" error + # here old_err_settings = np.seterr(divide='ignore', invalid='ignore') tp_sum = true_pos.sum() fp_sum = false_pos.sum() fn_sum = false_neg.sum() - avg_precision = divide(tp_sum, tp_sum + fp_sum, dtype=np.double) + avg_precision = divide(tp_sum, tp_sum + fp_sum, + dtype=np.double) avg_recall = divide(tp_sum, tp_sum + fn_sum, dtype=np.double) avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), beta2 * avg_precision + avg_recall, From 06c2c7b1b6e46b308495a970860efc240c131a4b Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 10 Jul 2013 16:03:29 +0200 Subject: [PATCH 09/14] ENH set warning to stacklevel 2 --- sklearn/metrics/metrics.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 2b14951d5735b..030446c72032c 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1539,7 +1539,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, % labels[idx_ill_defined_fbeta_score]) if warning_msg: - warnings.warn(warning_msg) + warnings.warn(warning_msg, stacklevel=2) if not average: return precision, recall, fscore, support @@ -1593,7 +1593,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, "to zero. Micro-fbeta_score is ill defined.") if warning_msg: - warnings.warn(warning_msg) + warnings.warn(warning_msg, stacklevel=2) elif average == 'macro': avg_precision = np.mean(precision) @@ -1607,7 +1607,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, avg_fscore = 0. warnings.warn("There isn't any labels in y_true. 
" "Weighted-precision, weighted-recall and " - "weighted-fbeta_score are ill defined.") + "weighted-fbeta_score are ill defined.", + stacklevel=2) else: avg_precision = np.average(precision, weights=support) From 6dafe5752147633e863883dfcbe2be5431faa46c Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 10 Jul 2013 16:04:18 +0200 Subject: [PATCH 10/14] TST silence warning --- sklearn/metrics/tests/test_metrics.py | 98 ++++++++++++++------------- 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index ea3d2c64c43a2..33c5d9d423f11 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -514,6 +514,8 @@ def test_precision_recall_f_binary_single_class(): negative class Such a case may occur with non-stratified cross-validation""" + warnings.simplefilter("ignore") + assert_equal(1., precision_score([1, 1], [1, 1])) assert_equal(1., recall_score([1, 1], [1, 1])) assert_equal(1., f1_score([1, 1], [1, 1])) @@ -1463,24 +1465,25 @@ def test_precision_recall_f1_score_multilabel_1(): y_true_bi = lb.transform(y_true_ll) y_pred_bi = lb.transform(y_pred_ll) + warnings.simplefilter("ignore") + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: - with warnings.catch_warnings(True): - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - #tp = [0, 1, 1, 0] - #fn = [1, 0, 0, 1] - #fp = [1, 1, 0, 0] - # Check per class + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + #tp = [0, 1, 1, 0] + #fn = [1, 0, 0, 1] + #fp = [1, 1, 0, 0] + # Check per class - assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) - assert_array_almost_equal(s, [1, 1, 1, 1], 2) + assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 1, 1, 1], 2) - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) # Check macro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1540,22 +1543,23 @@ def test_precision_recall_f1_score_multilabel_2(): y_true_bi = lb.transform(y_true_ll) y_pred_bi = lb.transform(y_pred_ll) + warnings.simplefilter("ignore") + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: # tp = [ 0. 1. 0. 0.] # fp = [ 1. 0. 0. 2.] # fn = [ 1. 1. 1. 0.] 
- with warnings.catch_warnings(True): - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) - assert_array_almost_equal(s, [1, 2, 1, 0], 2) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") @@ -1612,21 +1616,22 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): y_true_bi = lb.transform(y_true_ll) y_pred_bi = lb.transform(y_pred_ll) + warnings.simplefilter("ignore") + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: # true_pos = [ 0. 1. 1. 0.] # false_pos = [ 0. 0. 0. 1.] # false_neg = [ 1. 1. 0. 0.] - with warnings.catch_warnings(True): - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average=None) - assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2) - assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2) - assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) - assert_array_almost_equal(s, [1, 2, 1, 0], 2) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) - f2 = fbeta_score(y_true, y_pred, beta=2, average=None) - support = s - assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") @@ -1658,19 +1663,18 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): average="weighted"), np.average(f2, weights=support)) - with warnings.catch_warnings(True): - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="samples") - # |h(x_i) inter y_i | = [0, 0, 2] - # |y_i| = [1, 1, 2] - # |h(x_i)| = [0, 1, 2] - assert_almost_equal(p, 1 / 3) - assert_almost_equal(r, 1 / 3) - assert_almost_equal(f, 1 / 3) - assert_equal(s, None) - assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, - average="samples"), - 0.333, 2) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="samples") + # |h(x_i) inter y_i | = [0, 0, 2] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [0, 1, 2] + assert_almost_equal(p, 1 / 3) + assert_almost_equal(r, 1 / 3) + assert_almost_equal(f, 1 / 3) + assert_equal(s, None) + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, + average="samples"), + 0.333, 2) def test_precision_recall_f1_no_labels(): From 79a0cc9368f9802c0cb878b4ed63405940f5e6d1 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 10 Jul 2013 16:09:11 +0200 Subject: [PATCH 11/14] ENH use with np.errstate --- sklearn/metrics/metrics.py | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git 
a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 030446c72032c..667679918bf82 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -910,11 +910,9 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True): # Compute accuracy for each possible representation y_type, y_true, y_pred = _check_clf_targets(y_true, y_pred) if y_type == 'multilabel-indicator': - try: + with np.errstate(divide='ignore', invalid='ignore'): # oddly, we may get an "invalid" rather than a "divide" # error here - old_err_settings = np.seterr(divide='ignore', - invalid='ignore') y_pred_pos_label = y_pred == 1 y_true_pos_label = y_true == 1 pred_inter_true = np.sum(np.logical_and(y_pred_pos_label, @@ -929,8 +927,6 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True): # the jaccard to 1: lim_{x->0} x/x = 1 # Note with py2.6 and np 1.3: we can't check safely for nan. score[pred_union_true == 0.0] = 1.0 - finally: - np.seterr(**old_err_settings) elif y_type == 'multilabel-sequences': score = np.empty(len(y_true), dtype=np.float) @@ -1467,18 +1463,14 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, if warning_msg: warnings.warn(warning_msg) - try: + with np.errstate(divide="ignore", invalid="ignore"): # oddly, we may get an "invalid" rather than a "divide" error # here - old_err_settings = np.seterr(divide='ignore', invalid='ignore') - precision = divide(size_inter, size_pred, dtype=np.double) recall = divide(size_inter, size_true, dtype=np.double) f_score = divide((1 + beta2) * size_inter, (beta2 * size_true + size_pred), dtype=np.double) - finally: - np.seterr(**old_err_settings) precision[size_pred == 0] = 0.0 recall[size_true == 0] = 0.0 @@ -1493,9 +1485,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, true_pos, _, false_pos, false_neg = _tp_tn_fp_fn(y_true, y_pred, labels) support = true_pos + false_neg - try: + with np.errstate(divide='ignore', invalid='ignore'): # oddly, we may get an "invalid" rather than a "divide" error here - old_err_settings = np.seterr(divide='ignore', invalid='ignore') # precision and recall precision = divide(true_pos.astype(np.float), true_pos + false_pos) @@ -1515,8 +1506,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, # handle division by 0 in fscore idx_ill_defined_fbeta_score = (beta2 * precision + recall) == 0 fscore[idx_ill_defined_fbeta_score] = 0.0 - finally: - np.seterr(**old_err_settings) if average in (None, "macro", "weighted"): warning_msg = "" @@ -1557,10 +1546,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, else: average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average == 'micro': - try: + with np.errstate(divide='ignore', invalid='ignore'): # oddly, we may get an "invalid" rather than a "divide" error # here - old_err_settings = np.seterr(divide='ignore', invalid='ignore') + tp_sum = true_pos.sum() fp_sum = false_pos.sum() fn_sum = false_neg.sum() @@ -1570,11 +1559,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), beta2 * avg_precision + avg_recall, dtype=np.double) - finally: - np.seterr(**old_err_settings) warning_msg = "" - if tp_sum + fp_sum == 0: avg_precision = 0. 
warning_msg += ("The sum of true positives and false " From aa0c47e8b1696fbe48a74cc2af92eecade4458a4 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 10 Jul 2013 16:41:53 +0200 Subject: [PATCH 12/14] DOC TST correct comment --- sklearn/metrics/tests/test_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 33c5d9d423f11..1e00feddb6512 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1687,7 +1687,7 @@ def test_precision_recall_f1_no_labels(): # support = [0, 0, 0] # |y_hat_i inter y_i | = [0, 0, 0] # |y_i| = [0, 0, 0] - # |y_hat_i| = [1, 1, 2] + # |y_hat_i| = [0, 0, 0] with warnings.catch_warnings(record=True): warnings.simplefilter("always") From a0aa7776e7fbbfb737d500b4accd93cc7799b311 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 11:26:15 +0200 Subject: [PATCH 13/14] FIX warning test --- sklearn/svm/tests/test_sparse.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/sklearn/svm/tests/test_sparse.py b/sklearn/svm/tests/test_sparse.py index 82ac3308c730c..72fd0605d02bf 100644 --- a/sklearn/svm/tests/test_sparse.py +++ b/sklearn/svm/tests/test_sparse.py @@ -3,7 +3,7 @@ from scipy import sparse from sklearn import datasets, svm, linear_model, base from numpy.testing import (assert_array_almost_equal, assert_array_equal, - assert_equal) + assert_equal, assert_warns) from nose.tools import assert_raises, assert_true, assert_false from nose.tools import assert_equal as nose_assert_equal @@ -275,13 +275,7 @@ def test_sparse_svc_clone_with_callable_kernel(): def test_timeout(): sp = svm.SVC(C=1, kernel=lambda x, y: x * y.T, probability=True, max_iter=1) - with warnings.catch_warnings(record=True) as foo: - sp.fit(X_sp, Y) - nose_assert_equal(len(foo), 1, msg=foo) - nose_assert_equal(foo[0].category, ConvergenceWarning, - msg=foo[0].category) + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") - -if __name__ == '__main__': - import nose - nose.runmodule() + assert_warns(ConvergenceWarning, sp.fit, X_sp, Y) From a5a026ca3140a2f14c007510ac99c4ef12947472 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Thu, 25 Jul 2013 11:41:20 +0200 Subject: [PATCH 14/14] FIX warning tests in preprocessing --- sklearn/preprocessing/tests/test_data.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 37b81c3d03d71..b4bdb0f7d9076 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -10,6 +10,7 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false +from sklearn.utils.testing import assert_warns from sklearn.utils.sparsefuncs import mean_variance_axis0 from sklearn.preprocessing.data import _transform_selected @@ -306,13 +307,13 @@ def test_warning_scaling_integers(): X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8) - with warnings.catch_warnings(record=True) as w: - StandardScaler().fit(X) - assert_equal(len(w), 1) + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + assert_warns(UserWarning, StandardScaler().fit, X) - with warnings.catch_warnings(record=True) as w: - MinMaxScaler().fit(X) - assert_equal(len(w), 1) + with warnings.catch_warnings(record=True): + 
warnings.simplefilter("always") + assert_warns(UserWarning, MinMaxScaler().fit, X) def test_normalizer_l1():