Commit 6813528 (2 parents: 9e20fb6 + 4fcf20a)

Merge pull request #4192 from jnothman/binary_iff_binary
[MRG+2] P/R/F: in future, average='binary' iff 2 labels in y one of which is pos_label

File tree: 4 files changed, +108 −52 lines

sklearn/metrics/classification.py (+75 −36)
@@ -503,15 +503,19 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
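
As a concrete illustration of the new ``'binary'`` semantics documented in this hunk (a sketch, not part of the diff; it assumes a scikit-learn build that includes this change, and the labels are made up):

from sklearn.metrics import f1_score

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1]

# 'binary' reports only the positive class (pos_label=1 by default):
# precision = 1.0, recall = 0.75, so F1 = 2 * 0.75 / 1.75 ~= 0.857
print(f1_score(y_true, y_pred, average='binary'))

# average=None returns one score per class instead, here roughly
# [0.8, 0.857] for classes 0 and 1.
print(f1_score(y_true, y_pred, average=None))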
@@ -528,6 +532,10 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 
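To make the deprecation note concrete, here is a sketch of the transitional behaviour (again assuming a build with this change; the data, as above, is made up):

import warnings
from sklearn.metrics import f1_score

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1]

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # Deprecated path: binary y with average != 'binary' still reports
    # only pos_label for now, but emits a DeprecationWarning.
    f1_score(y_true, y_pred, average='macro')
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# Until 0.18, a true macro average over binary targets requires
# pos_label=None, as the new pos_label docstring states:
f1_score(y_true, y_pred, pos_label=None, average='macro')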
@@ -590,15 +598,19 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -615,6 +627,10 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 
@@ -750,14 +766,18 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : string, [None (default), 'micro', 'macro', 'samples', 'weighted']
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+    average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \
+                       'weighted']
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
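
For the tuple-returning function, the same option reads as follows (an illustrative sketch; the note that support is None under averaging follows this function's return contract):

from sklearn.metrics import precision_recall_fscore_support

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1]

# Scalars for the positive class only; support comes back as None
# whenever an average is requested.
p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                             average='binary')

# average=None (the default here) yields per-class arrays instead,
# including integer supports.
p_all, r_all, f_all, s_all = precision_recall_fscore_support(y_true, y_pred,
                                                             average=None)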
@@ -774,6 +794,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     warn_for : tuple or set, for internal use
         This determines which warnings will be made in the case that this
         function is being used to return only one of its metrics.
@@ -834,11 +858,11 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
 
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
 
-    if average == 'binary' and y_type != 'binary':
+    if average == 'binary' and (y_type != 'binary' or pos_label is None):
         warnings.warn('The default `weighted` averaging is deprecated, '
                       'and from version 0.18, use of precision, recall or '
-                      'F-score with multiclass or multilabel data will result '
-                      'in an exception. '
+                      'F-score with multiclass or multilabel data or '
+                      'pos_label=None will result in an exception. '
                       'Please set an explicit value for `average`, one of '
                       '%s. In cross validation use, for instance, '
                       'scoring="f1_weighted" instead of scoring="f1".'
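
This reworded branch fires whenever the lingering default cannot be honoured: multiclass/multilabel y, or pos_label=None, with average left at its transitional 'binary' default. A sketch of the explicit call that avoids the warning (data taken from the compatibility test further below):

from sklearn.metrics import recall_score

y_true = [1, 2, 3, 3]
y_pred = [1, 2, 3, 1]

# recall_score(y_true, y_pred)  # would warn: default averaging deprecated
recall_score(y_true, y_pred, average='weighted')  # explicit, no warning
# In cross-validation, an explicit weighted scorer plays the same role,
# e.g. scoring="f1_weighted" rather than scoring="f1".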
@@ -900,14 +924,12 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
     ### Select labels to keep ###
 
     if y_type == 'binary' and average is not None and pos_label is not None:
-        if average != 'binary' and label_order is not None \
-                and len(label_order) == 2:
-            warnings.warn('In the future, providing two `labels` values, as '
-                          'well as `average!=`binary`` will average over '
-                          'those labels. For now, please use `labels=None` '
-                          'with `pos_label` to evaluate precision, recall and '
-                          'F-score for the positive label only.',
-                          FutureWarning)
+        if average != 'binary':
+            warnings.warn('From version 0.18, binary input will not be '
+                          'handled specially when using averaged '
+                          'precision/recall/F-score. '
+                          'Please use average=\'binary\' to report only the '
+                          'positive class performance.', DeprecationWarning)
         if pos_label not in labels:
             if len(labels) == 1:
                 # Only negative labels
@@ -955,6 +977,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
             weights = None
 
     if average is not None:
+        assert average != 'binary' or len(precision) == 1
         precision = np.average(precision, weights=weights)
         recall = np.average(recall, weights=weights)
         f_score = np.average(f_score, weights=weights)
@@ -992,15 +1015,19 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -1017,6 +1044,10 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 
@@ -1075,15 +1106,19 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         Integer array of labels.
 
     pos_label : str or int, 1 by default
-        If ``average`` is not ``None`` and the classification target is binary,
-        only this class's scores will be returned.
+        The class to report if ``average='binary'``. Until version 0.18 it is
+        necessary to set ``pos_label=None`` if seeking to use another averaging
+        method over binary targets.
 
-    average : one of [None, 'micro', 'macro', 'samples', 'weighted']
+    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
+                       'weighted']
         This parameter is required for multiclass/multilabel targets.
-        If ``None``, the scores for each class are returned. Otherwise,
-        unless ``pos_label`` is given in binary classification, this
+        If ``None``, the scores for each class are returned. Otherwise, this
         determines the type of averaging performed on the data:
 
+        ``'binary'``:
+            Only report results for the class specified by ``pos_label``.
+            This is applicable only if targets (``y_{true,pred}``) are binary.
         ``'micro'``:
             Calculate metrics globally by counting the total true positives,
             false negatives and false positives.
@@ -1100,6 +1135,10 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         meaningful for multilabel classification where this differs from
         :func:`accuracy_score`).
 
+        Note that if ``pos_label`` is given in binary classification with
+        `average != 'binary'`, only that positive class is reported. This
+        behavior is deprecated and will change in version 0.18.
+
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 

sklearn/metrics/tests/test_classification.py (+27 −10)
@@ -140,18 +140,23 @@ def test_precision_recall_f1_score_binary():
 
     # individual scoring function that can be used for grid search: in the
     # binary class case the score is the value of the measure for the positive
-    # class (e.g. label == 1)
-    ps = precision_score(y_true, y_pred)
-    assert_array_almost_equal(ps, 0.85, 2)
+    # class (e.g. label == 1). This is deprecated for average != 'binary'.
+    assert_dep_warning = partial(assert_warns, DeprecationWarning)
+    for kwargs, my_assert in [({}, assert_no_warnings),
+                              ({'average': 'binary'}, assert_no_warnings),
+                              ({'average': 'micro'}, assert_dep_warning)]:
+        ps = my_assert(precision_score, y_true, y_pred, **kwargs)
+        assert_array_almost_equal(ps, 0.85, 2)
 
-    rs = recall_score(y_true, y_pred)
-    assert_array_almost_equal(rs, 0.68, 2)
+        rs = my_assert(recall_score, y_true, y_pred, **kwargs)
+        assert_array_almost_equal(rs, 0.68, 2)
 
-    fs = f1_score(y_true, y_pred)
-    assert_array_almost_equal(fs, 0.76, 2)
+        fs = my_assert(f1_score, y_true, y_pred, **kwargs)
+        assert_array_almost_equal(fs, 0.76, 2)
 
-    assert_almost_equal(fbeta_score(y_true, y_pred, beta=2),
-                        (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)
+        assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2,
+                                      **kwargs),
+                            (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)
 
 
 @ignore_warnings
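
The rewritten loop leans on the testing helpers returning the wrapped function's value: both assert_no_warnings(f, *args, **kw) and assert_warns(Category, f, *args, **kw) call f, check the warning state, and hand back f's result, which is why ps, rs and fs stay usable in the F-beta identity. A standalone sketch of the same pattern (made-up labels):

from functools import partial
from sklearn.metrics import precision_score
from sklearn.utils.testing import assert_no_warnings, assert_warns

y_true, y_pred = [0, 1, 1, 0], [0, 1, 0, 0]

assert_dep_warning = partial(assert_warns, DeprecationWarning)
ps_ok = assert_no_warnings(precision_score, y_true, y_pred,
                           average='binary')   # must not warn
ps_dep = assert_dep_warning(precision_score, y_true, y_pred,
                            average='micro')   # must warn (deprecated path)
assert ps_ok == ps_dep  # both still report only the positive class for now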
@@ -206,6 +211,7 @@ def test_average_precision_score_tied_values():
     assert_not_equal(average_precision_score(y_true, y_score), 1.)
 
 
+@ignore_warnings
 def test_precision_recall_fscore_support_errors():
     y_true, y_pred, _ = make_prediction(binary=True)
 
@@ -1012,6 +1018,8 @@ def test_prf_average_compat():
     """
     y_true = [1, 2, 3, 3]
     y_pred = [1, 2, 3, 1]
+    y_true_bin = [0, 1, 1]
+    y_pred_bin = [0, 1, 0]
 
     for metric in [precision_score, recall_score, f1_score,
                    partial(fbeta_score, beta=2)]:
@@ -1022,7 +1030,16 @@ def test_prf_average_compat():
                      'average does not act like "weighted" by default')
 
         # check binary passes without warning
-        assert_no_warnings(metric, [0, 1, 1], [0, 1, 0])
+        assert_no_warnings(metric, y_true_bin, y_pred_bin)
+
+        # but binary with pos_label=None should behave like multiclass
+        score = assert_warns(DeprecationWarning, metric,
+                             y_true_bin, y_pred_bin, pos_label=None)
+        score_weighted = assert_no_warnings(metric, y_true_bin, y_pred_bin,
+                                            pos_label=None, average='weighted')
+        assert_equal(score, score_weighted,
+                     'average does not act like "weighted" by default with '
+                     'binary data and pos_label=None')
 
 
 @ignore_warnings  # sequence of sequences is deprecated
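
Outside the test harness, the new assertions amount to the following (a sketch; the data is copied from the test and assumes a build with this change):

import warnings
from sklearn.metrics import f1_score

y_true_bin = [0, 1, 1]
y_pred_bin = [0, 1, 0]

with warnings.catch_warnings():
    warnings.simplefilter('ignore', DeprecationWarning)
    default = f1_score(y_true_bin, y_pred_bin, pos_label=None)

weighted = f1_score(y_true_bin, y_pred_bin, pos_label=None,
                    average='weighted')
assert default == weighted  # pos_label=None still averages like 'weighted'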

sklearn/metrics/tests/test_common.py (+5 −4)
@@ -221,6 +221,7 @@
 
     "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",
 
+    # pos_label support deprecated; to be removed in 0.18:
     "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
     "weighted_precision_score", "weighted_recall_score",
 
@@ -232,7 +233,7 @@
 ]
 
 # Metrics with a "labels" argument
-# XXX: Handle multi_class metrics that has a labels argument as well as a
+# TODO: Handle multi_class metrics that has a labels argument as well as a
 # decision function argument. e.g hinge_loss
 METRICS_WITH_LABELS = [
     "confusion_matrix",
@@ -942,7 +943,7 @@ def check_sample_weight_invariance(name, metric, y1, y2):
         unweighted_score,
         metric(y1, y2, sample_weight=np.ones(shape=len(y1))),
         err_msg="For %s sample_weight=None is not equivalent to "
-        "sample_weight=ones" % name)
+                "sample_weight=ones" % name)
 
     # check that the weighted and unweighted scores are unequal
     weighted_score = metric(y1, y2, sample_weight=sample_weight)
@@ -957,8 +958,8 @@ def check_sample_weight_invariance(name, metric, y1, y2):
     assert_almost_equal(
         weighted_score, weighted_score_list,
         err_msg="Weighted scores for array and list sample_weight input are "
-        "not equal (%f != %f) for %s" % (
-            weighted_score, weighted_score_list, name))
+                "not equal (%f != %f) for %s" % (
+                    weighted_score, weighted_score_list, name))
 
     # check that integer weights is the same as repeated samples
     repeat_weighted_score = metric(

sklearn/metrics/tests/test_score_objects.py (+1 −2)
@@ -172,8 +172,7 @@ def test_classification_scores():
     # test fbeta score that takes an argument
     scorer = make_scorer(fbeta_score, beta=2)
     score1 = scorer(clf, X_test, y_test)
-    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2,
-                         average='weighted')
+    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2)
     assert_almost_equal(score1, score2)
 
     # test that custom scorer can be pickled
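
The simplification works because make_scorer(fbeta_score, beta=2) now agrees with a plain default-average call on binary targets. A hypothetical end-to-end check along the same lines (the dataset and estimator here are made up for illustration, not taken from the test):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer

X, y = make_classification(random_state=0)
clf = LogisticRegression().fit(X, y)

scorer = make_scorer(fbeta_score, beta=2)
direct = fbeta_score(y, clf.predict(X), beta=2)  # default average
assert abs(scorer(clf, X, y) - direct) < 1e-12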
