Commit 6813528 (2 parents: 9e20fb6 + 4fcf20a)

Merge pull request #4192 from jnothman/binary_iff_binary
[MRG+2] P/R/F: in future, average='binary' iff 2 labels in y one of which is pos_label

File tree: 4 files changed, +108 −52 lines

sklearn/metrics/classification.py (+75 −36)
@@ -503,15 +503,19 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
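
As a concrete illustration of the new ``'binary'`` semantics documented in this hunk (a sketch, not part of the diff; it assumes a scikit-learn build that includes this change, and the labels are made up):

from sklearn.metrics import f1_score

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1]

# 'binary' reports only the positive class (pos_label=1 by default):
# precision = 1.0, recall = 0.75, so F1 = 2 * 0.75 / 1.75 ~= 0.857
print(f1_score(y_true, y_pred, average='binary'))

# average=None returns one score per class instead, here roughly
# [0.8, 0.857] for classes 0 and 1.
print(f1_score(y_true, y_pred, average=None))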
@@ -528,6 +532,10 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 
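To make the deprecation note concrete, here is a sketch of the transitional behaviour (again assuming a build with this change; the data, as above, is made up):

import warnings
from sklearn.metrics import f1_score

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1]

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # Deprecated path: binary y with average != 'binary' still reports
    # only pos_label for now, but emits a DeprecationWarning.
    f1_score(y_true, y_pred, average='macro')
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# Until 0.18, a true macro average over binary targets requires
# pos_label=None, as the new pos_label docstring states:
f1_score(y_true, y_pred, pos_label=None, average='macro')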
@@ -590,15 +598,19 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -615,6 +627,10 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 
@@ -750,14 +766,18 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : string, [None (default), 'micro', 'macro', 'samples', 'weighted']
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+    average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \
+                       'weighted']
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
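
For the tuple-returning function, the same option reads as follows (an illustrative sketch; the note that support is None under averaging follows this function's return contract):

from sklearn.metrics import precision_recall_fscore_support

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1]

# Scalars for the positive class only; support comes back as None
# whenever an average is requested.
p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                             average='binary')

# average=None (the default here) yields per-class arrays instead,
# including integer supports.
p_all, r_all, f_all, s_all = precision_recall_fscore_support(y_true, y_pred,
                                                             average=None)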
@@ -774,6 +794,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     warn_for : tuple or set, for internal use
         This determines which warnings will be made in the case that this
         function is being used to return only one of its metrics.
@@ -834,11 +858,11 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
 
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
 
-    if average == 'binary' and y_type != 'binary':
+    if average == 'binary' and (y_type != 'binary' or pos_label is None):
         warnings.warn('The default `weighted` averaging is deprecated, '
                       'and from version 0.18, use of precision, recall or '
-                      'F-score with multiclass or multilabel data will result '
-                      'in an exception. '
+                      'F-score with multiclass or multilabel data or '
+                      'pos_label=None will result in an exception. '
                       'Please set an explicit value for `average`, one of '
                       '%s. In cross validation use, for instance, '
                       'scoring="f1_weighted" instead of scoring="f1".'
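
This reworded branch fires whenever the lingering default cannot be honoured: multiclass/multilabel y, or pos_label=None, with average left at its transitional 'binary' default. A sketch of the explicit call that avoids the warning (data taken from the compatibility test further below):

from sklearn.metrics import recall_score

y_true = [1, 2, 3, 3]
y_pred = [1, 2, 3, 1]

# recall_score(y_true, y_pred)  # would warn: default averaging deprecated
recall_score(y_true, y_pred, average='weighted')  # explicit, no warning
# In cross-validation, an explicit weighted scorer plays the same role,
# e.g. scoring="f1_weighted" rather than scoring="f1".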
@@ -900,14 +924,12 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
     ### Select labels to keep ###
 
     if y_type == 'binary' and average is not None and pos_label is not None:
-        if average != 'binary' and label_order is not None \
-                and len(label_order) == 2:
-            warnings.warn('In the future, providing two `labels` values, as '
-                          'well as `average!=`binary`` will average over '
-                          'those labels. For now, please use `labels=None` '
-                          'with `pos_label` to evaluate precision, recall and '
-                          'F-score for the positive label only.',
-                          FutureWarning)
+        if average != 'binary':
+            warnings.warn('From version 0.18, binary input will not be '
+                          'handled specially when using averaged '
+                          'precision/recall/F-score. '
+                          'Please use average=\'binary\' to report only the '
+                          'positive class performance.', DeprecationWarning)
         if pos_label not in labels:
             if len(labels) == 1:
                 # Only negative labels
@@ -955,6 +977,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
             weights = None
 
     if average is not None:
+        assert average != 'binary' or len(precision) == 1
         precision = np.average(precision, weights=weights)
         recall = np.average(recall, weights=weights)
         f_score = np.average(f_score, weights=weights)
@@ -992,15 +1015,19 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -1017,6 +1044,10 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 
@@ -1075,15 +1106,19 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -1100,6 +1135,10 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 

sklearn/metrics/tests/test_classification.py (+27 −10)
@@ -140,18 +140,23 @@ def test_precision_recall_f1_score_binary():
 
     # individual scoring function that can be used for grid search: in the
     # binary class case the score is the value of the measure for the positive
-    # class (e.g. label == 1)
-    ps = precision_score(y_true, y_pred)
-    assert_array_almost_equal(ps, 0.85, 2)
+    # class (e.g. label == 1). This is deprecated for average != 'binary'.
+    assert_dep_warning = partial(assert_warns, DeprecationWarning)
+    for kwargs, my_assert in [({}, assert_no_warnings),
+                              ({'average': 'binary'}, assert_no_warnings),
+                              ({'average': 'micro'}, assert_dep_warning)]:
+        ps = my_assert(precision_score, y_true, y_pred, **kwargs)
+        assert_array_almost_equal(ps, 0.85, 2)
 
-    rs = recall_score(y_true, y_pred)
-    assert_array_almost_equal(rs, 0.68, 2)
+        rs = my_assert(recall_score, y_true, y_pred, **kwargs)
+        assert_array_almost_equal(rs, 0.68, 2)
 
-    fs = f1_score(y_true, y_pred)
-    assert_array_almost_equal(fs, 0.76, 2)
+        fs = my_assert(f1_score, y_true, y_pred, **kwargs)
+        assert_array_almost_equal(fs, 0.76, 2)
 
-    assert_almost_equal(fbeta_score(y_true, y_pred, beta=2),
-                        (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)
+        assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2,
+                                      **kwargs),
+                            (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)
 
 
 @ignore_warnings
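
The rewritten loop leans on the testing helpers returning the wrapped function's value: both assert_no_warnings(f, *args, **kw) and assert_warns(Category, f, *args, **kw) call f, check the warning state, and hand back f's result, which is why ps, rs and fs stay usable in the F-beta identity. A standalone sketch of the same pattern (made-up labels):

from functools import partial
from sklearn.metrics import precision_score
from sklearn.utils.testing import assert_no_warnings, assert_warns

y_true, y_pred = [0, 1, 1, 0], [0, 1, 0, 0]

assert_dep_warning = partial(assert_warns, DeprecationWarning)
ps_ok = assert_no_warnings(precision_score, y_true, y_pred,
                           average='binary')   # must not warn
ps_dep = assert_dep_warning(precision_score, y_true, y_pred,
                            average='micro')   # must warn (deprecated path)
assert ps_ok == ps_dep  # both still report only the positive class for now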
@@ -206,6 +211,7 @@ def test_average_precision_score_tied_values():
     assert_not_equal(average_precision_score(y_true, y_score), 1.)
 
 
+@ignore_warnings
 def test_precision_recall_fscore_support_errors():
     y_true, y_pred, _ = make_prediction(binary=True)
 
@@ -1012,6 +1018,8 @@ def test_prf_average_compat():
     """
     y_true = [1, 2, 3, 3]
     y_pred = [1, 2, 3, 1]
+    y_true_bin = [0, 1, 1]
+    y_pred_bin = [0, 1, 0]
 
     for metric in [precision_score, recall_score, f1_score,
                    partial(fbeta_score, beta=2)]:
@@ -1022,7 +1030,16 @@ def test_prf_average_compat():
                      'average does not act like "weighted" by default')
 
         # check binary passes without warning
-        assert_no_warnings(metric, [0, 1, 1], [0, 1, 0])
+        assert_no_warnings(metric, y_true_bin, y_pred_bin)
+
+        # but binary with pos_label=None should behave like multiclass
+        score = assert_warns(DeprecationWarning, metric,
+                             y_true_bin, y_pred_bin, pos_label=None)
+        score_weighted = assert_no_warnings(metric, y_true_bin, y_pred_bin,
+                                            pos_label=None, average='weighted')
+        assert_equal(score, score_weighted,
+                     'average does not act like "weighted" by default with '
+                     'binary data and pos_label=None')
 
 
 @ignore_warnings  # sequence of sequences is deprecated
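
Outside the test harness, the new assertions amount to the following (a sketch; the data is copied from the test and assumes a build with this change):

import warnings
from sklearn.metrics import f1_score

y_true_bin = [0, 1, 1]
y_pred_bin = [0, 1, 0]

with warnings.catch_warnings():
    warnings.simplefilter('ignore', DeprecationWarning)
    default = f1_score(y_true_bin, y_pred_bin, pos_label=None)

weighted = f1_score(y_true_bin, y_pred_bin, pos_label=None,
                    average='weighted')
assert default == weighted  # pos_label=None still averages like 'weighted'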

sklearn/metrics/tests/test_common.py (+5 −4)
@@ -221,6 +221,7 @@
 
     "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",
 
+    # pos_label support deprecated; to be removed in 0.18:
     "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
     "weighted_precision_score", "weighted_recall_score",
 
@@ -232,7 +233,7 @@
 ]
 
 # Metrics with a "labels" argument
-# XXX: Handle multi_class metrics that has a labels argument as well as a
+# TODO: Handle multi_class metrics that has a labels argument as well as a
 # decision function argument. e.g hinge_loss
 METRICS_WITH_LABELS = [
     "confusion_matrix",
@@ -942,7 +943,7 @@ def check_sample_weight_invariance(name, metric, y1, y2):
         unweighted_score,
         metric(y1, y2, sample_weight=np.ones(shape=len(y1))),
         err_msg="For %s sample_weight=None is not equivalent to "
-        "sample_weight=ones" % name)
+                "sample_weight=ones" % name)
 
     # check that the weighted and unweighted scores are unequal
     weighted_score = metric(y1, y2, sample_weight=sample_weight)
@@ -957,8 +958,8 @@ def check_sample_weight_invariance(name, metric, y1, y2):
     assert_almost_equal(
         weighted_score, weighted_score_list,
         err_msg="Weighted scores for array and list sample_weight input are "
-        "not equal (%f != %f) for %s" % (
-            weighted_score, weighted_score_list, name))
+                "not equal (%f != %f) for %s" % (
+                    weighted_score, weighted_score_list, name))
 
     # check that integer weights is the same as repeated samples
     repeat_weighted_score = metric(

sklearn/metrics/tests/test_score_objects.py (+1 −2)
@@ -172,8 +172,7 @@ def test_classification_scores():
     # test fbeta score that takes an argument
     scorer = make_scorer(fbeta_score, beta=2)
     score1 = scorer(clf, X_test, y_test)
-    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2,
-                         average='weighted')
+    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2)
     assert_almost_equal(score1, score2)
 
     # test that custom scorer can be pickled
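
The simplification works because make_scorer(fbeta_score, beta=2) now agrees with a plain default-average call on binary targets. A hypothetical end-to-end check along the same lines (the dataset and estimator here are made up for illustration, not taken from the test):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer

X, y = make_classification(random_state=0)
clf = LogisticRegression().fit(X, y)

scorer = make_scorer(fbeta_score, beta=2)
direct = fbeta_score(y, clf.predict(X), beta=2)  # default average
assert abs(scorer(clf, X, y) - direct) < 1e-12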
