Skip to content

Commit 2237b17

Browse files
qinhanmin2014 authored and adrinjalali committed
MNT Consistent warning and more doc about the edge cases of P/R/F (#13143)
* consistent error message
* new test
* ignore warnings
* notes
* joel's comment
* adrin's comment
1 parent 39ef674 commit 2237b17

File tree

2 files changed

+55
-10
lines changed

2 files changed

+55
-10
lines changed

sklearn/metrics/classification.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,11 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
922922
>>> f1_score(y_true, y_pred, average=None)
923923
array([0.8, 0. , 0. ])
924924
925+
Notes
926+
-----
927+
When ``true positive + false positive == 0`` or
928+
``true positive + false negative == 0``, f-score returns 0 and raises
929+
``UndefinedMetricWarning``.
925930
"""
926931
return fbeta_score(y_true, y_pred, 1, labels=labels,
927932
pos_label=pos_label, average=average,
@@ -1036,6 +1041,11 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
10361041
... # doctest: +ELLIPSIS
10371042
array([0.71..., 0. , 0. ])
10381043
1044+
Notes
1045+
-----
1046+
When ``true positive + false positive == 0`` or
1047+
``true positive + false negative == 0``, f-score returns 0 and raises
1048+
``UndefinedMetricWarning``.
10391049
"""
10401050
_, _, f, _ = precision_recall_fscore_support(y_true, y_pred,
10411051
beta=beta,
@@ -1233,6 +1243,12 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
12331243
array([0., 0., 1.]), array([0. , 0. , 0.8]),
12341244
array([2, 2, 2]))
12351245
1246+
Notes
1247+
-----
1248+
When ``true positive + false positive == 0``, precision is undefined;
1249+
When ``true positive + false negative == 0``, recall is undefined.
1250+
In such cases, the metric will be set to 0, as will f-score, and
1251+
``UndefinedMetricWarning`` will be raised.
12361252
"""
12371253
average_options = (None, 'micro', 'macro', 'weighted', 'samples')
12381254
if average not in average_options and average != 'binary':
@@ -1247,13 +1263,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
12471263

12481264
if average == 'binary':
12491265
if y_type == 'binary':
1250-
if pos_label not in present_labels:
1251-
if len(present_labels) < 2:
1252-
# Only negative labels
1253-
return (0., 0., 0., 0)
1254-
else:
1255-
raise ValueError("pos_label=%r is not a valid label: %r" %
1256-
(pos_label, present_labels))
1266+
if pos_label not in present_labels and len(present_labels) >= 2:
1267+
raise ValueError("pos_label=%r is not a valid label: %r" %
1268+
(pos_label, present_labels))
12571269
labels = [pos_label]
12581270
else:
12591271
raise ValueError("Target is %s but average='binary'. Please "
@@ -1279,7 +1291,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
12791291
true_sum = np.array([true_sum.sum()])
12801292

12811293
# Finally, we have all our sufficient statistics. Divide! #
1282-
12831294
beta2 = beta ** 2
12841295
with np.errstate(divide='ignore', invalid='ignore'):
12851296
# Divide, and on zero-division, set scores to 0 and warn:
@@ -1297,7 +1308,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
12971308
f_score[tp_sum == 0] = 0.0
12981309

12991310
# Average the results
1300-
13011311
if average == 'weighted':
13021312
weights = true_sum
13031313
if weights.sum() == 0:
@@ -1410,6 +1420,10 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
14101420
>>> precision_score(y_true, y_pred, average=None) # doctest: +ELLIPSIS
14111421
array([0.66..., 0. , 0. ])
14121422
1423+
Notes
1424+
-----
1425+
When ``true positive + false positive == 0``, precision returns 0 and
1426+
raises ``UndefinedMetricWarning``.
14131427
"""
14141428
p, _, _, _ = precision_recall_fscore_support(y_true, y_pred,
14151429
labels=labels,
@@ -1512,6 +1526,10 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
15121526
>>> recall_score(y_true, y_pred, average=None)
15131527
array([1., 0., 0.])
15141528
1529+
Notes
1530+
-----
1531+
When ``true positive + false negative == 0``, recall returns 0 and raises
1532+
``UndefinedMetricWarning``.
15151533
"""
15161534
_, r, _, _ = precision_recall_fscore_support(y_true, y_pred,
15171535
labels=labels,

sklearn/metrics/tests/test_classification.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ def test_precision_recall_f1_score_binary():
198198
(1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)
199199

200200

201+
@ignore_warnings
201202
def test_precision_recall_f_binary_single_class():
202203
# Test precision, recall and F1 score behave with a single positive or
203204
# negative class
@@ -1065,6 +1066,7 @@ def test_classification_report_no_labels_target_names_unequal_length():
10651066
y_true, y_pred, target_names=target_names)
10661067

10671068

1069+
@ignore_warnings
10681070
def test_multilabel_classification_report():
10691071
n_classes = 4
10701072
n_samples = 50
@@ -1446,6 +1448,17 @@ def test_prf_warnings():
14461448
'being set to 0.0 due to no true samples.')
14471449
my_assert(w, msg, f, [-1, -1], [1, 1], average='binary')
14481450

1451+
clean_warning_registry()
1452+
with warnings.catch_warnings(record=True) as record:
1453+
warnings.simplefilter('always')
1454+
precision_recall_fscore_support([0, 0], [0, 0], average="binary")
1455+
msg = ('Recall and F-score are ill-defined and '
1456+
'being set to 0.0 due to no true samples.')
1457+
assert_equal(str(record.pop().message), msg)
1458+
msg = ('Precision and F-score are ill-defined and '
1459+
'being set to 0.0 due to no predicted samples.')
1460+
assert_equal(str(record.pop().message), msg)
1461+
14491462

14501463
def test_recall_warnings():
14511464
assert_no_warnings(recall_score,
@@ -1461,19 +1474,26 @@ def test_recall_warnings():
14611474
assert_equal(str(record.pop().message),
14621475
'Recall is ill-defined and '
14631476
'being set to 0.0 due to no true samples.')
1477+
recall_score([0, 0], [0, 0])
1478+
assert_equal(str(record.pop().message),
1479+
'Recall is ill-defined and '
1480+
'being set to 0.0 due to no true samples.')
14641481

14651482

14661483
def test_precision_warnings():
14671484
clean_warning_registry()
14681485
with warnings.catch_warnings(record=True) as record:
14691486
warnings.simplefilter('always')
1470-
14711487
precision_score(np.array([[1, 1], [1, 1]]),
14721488
np.array([[0, 0], [0, 0]]),
14731489
average='micro')
14741490
assert_equal(str(record.pop().message),
14751491
'Precision is ill-defined and '
14761492
'being set to 0.0 due to no predicted samples.')
1493+
precision_score([0, 0], [0, 0])
1494+
assert_equal(str(record.pop().message),
1495+
'Precision is ill-defined and '
1496+
'being set to 0.0 due to no predicted samples.')
14771497

14781498
assert_no_warnings(precision_score,
14791499
np.array([[0, 0], [0, 0]]),
@@ -1499,6 +1519,13 @@ def test_fscore_warnings():
14991519
assert_equal(str(record.pop().message),
15001520
'F-score is ill-defined and '
15011521
'being set to 0.0 due to no true samples.')
1522+
score([0, 0], [0, 0])
1523+
assert_equal(str(record.pop().message),
1524+
'F-score is ill-defined and '
1525+
'being set to 0.0 due to no true samples.')
1526+
assert_equal(str(record.pop().message),
1527+
'F-score is ill-defined and '
1528+
'being set to 0.0 due to no predicted samples.')
15021529

15031530

15041531
def test_prf_average_binary_data_non_binary():

0 commit comments

Comments (0)