
Commit 193670c

FIX select the probability estimates or transform the decision values when pos_label is provided (#18114)

1 parent 06c710a · commit 193670c

File tree

3 files changed, +283 −20 lines changed

doc/whats_new/v0.24.rst

+7
@@ -418,6 +418,13 @@ Changelog
   ``labels`` parameter.
   :pr:`17935` by :user:`Cary Goltermann <Ultramann>`.
 
+- |Fix| Fix scorers that accept a `pos_label` parameter and compute their
+  metrics from values returned by `decision_function` or `predict_proba`.
+  Previously, they would return erroneous values when `pos_label` did not
+  correspond to `classifier.classes_[1]`. This is especially important when
+  training classifiers directly with string-labeled target classes.
+  :pr:`18114` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.model_selection`
 ..............................
 
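
To make the changelog entry concrete, here is a minimal sketch (illustration only, not part of the commit) of the scenario it describes: with string targets, the lexicographically smaller label lands in `classifier.classes_[0]`, so a scorer that always read column 1 of `predict_proba` scored the wrong class.

    import numpy as np
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import brier_score_loss, make_scorer

    X, y = load_breast_cancer(return_X_y=True)
    # "cancer" sorts before "not cancer", so it becomes classes_[0],
    # not the implicit positive class classes_[1]
    y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object)
    clf = LogisticRegression(max_iter=10000).fit(X, y)
    print(clf.classes_)  # ['cancer' 'not cancer']

    # With this fix, the scorer picks the "cancer" column of predict_proba;
    # before it, column 1 ("not cancer") was used silently.
    scorer = make_scorer(brier_score_loss, needs_proba=True, pos_label="cancer")
    print(scorer(clf, X, y))
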
sklearn/metrics/_scorer.py

+53 −17
@@ -127,6 +127,43 @@ def __init__(self, score_func, sign, kwargs):
         self._score_func = score_func
         self._sign = sign
 
+    @staticmethod
+    def _check_pos_label(pos_label, classes):
+        if pos_label not in list(classes):
+            raise ValueError(
+                f"pos_label={pos_label} is not a valid label: {classes}"
+            )
+
+    def _select_proba_binary(self, y_pred, classes):
+        """Select the column of the positive label in `y_pred` when
+        probabilities are provided.
+
+        Parameters
+        ----------
+        y_pred : ndarray of shape (n_samples, n_classes)
+            The prediction given by `predict_proba`.
+
+        classes : ndarray of shape (n_classes,)
+            The class labels for the estimator.
+
+        Returns
+        -------
+        y_pred : ndarray of shape (n_samples,)
+            Probability predictions of the positive class.
+        """
+        if y_pred.shape[1] == 2:
+            pos_label = self._kwargs.get("pos_label", classes[1])
+            self._check_pos_label(pos_label, classes)
+            col_idx = np.flatnonzero(classes == pos_label)[0]
+            return y_pred[:, col_idx]
+
+        err_msg = (
+            f"Got predict_proba of shape {y_pred.shape}, but need "
+            f"classifier with two classes for {self._score_func.__name__} "
+            f"scoring"
+        )
+        raise ValueError(err_msg)
+
     def __repr__(self):
         kwargs_string = "".join([", %s=%s" % (str(k), str(v))
                                  for k, v in self._kwargs.items()])
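
In isolation, the selection logic added above is a small lookup. The sketch below mirrors `_select_proba_binary` as a free function; the name `select_positive_column` is made up for illustration:

    import numpy as np

    def select_positive_column(y_proba, classes, pos_label):
        # hypothetical standalone version of the helper added above
        if pos_label not in list(classes):
            raise ValueError(f"pos_label={pos_label} is not a valid label: {classes}")
        # classes_ is stored in sorted order, so the positive label can sit
        # at either index; look up its column instead of hard-coding column 1
        col_idx = np.flatnonzero(classes == pos_label)[0]
        return y_proba[:, col_idx]

    proba = np.array([[0.9, 0.1], [0.2, 0.8]])
    classes = np.array(["cancer", "not cancer"], dtype=object)
    print(select_positive_column(proba, classes, "cancer"))  # [0.9 0.2]
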
@@ -237,14 +274,11 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
 
         y_type = type_of_target(y)
         y_pred = method_caller(clf, "predict_proba", X)
-        if y_type == "binary":
-            if y_pred.shape[1] == 2:
-                y_pred = y_pred[:, 1]
-            elif y_pred.shape[1] == 1:  # not multiclass
-                raise ValueError('got predict_proba of shape {},'
-                                 ' but need classifier with two'
-                                 ' classes for {} scoring'.format(
-                                     y_pred.shape, self._score_func.__name__))
+        if y_type == "binary" and y_pred.shape[1] <= 2:
+            # `y_type` could be equal to "binary" even in a multi-class
+            # problem (when only 2 classes are given to `y_true` during
+            # scoring). Thus, we need to check the shape of `y_pred`.
+            y_pred = self._select_proba_binary(y_pred, clf.classes_)
         if sample_weight is not None:
             return self._sign * self._score_func(y, y_pred,
                                                  sample_weight=sample_weight,
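
The new `y_pred.shape[1] <= 2` guard handles the subtlety mentioned in the inline comment: `type_of_target` only inspects the `y` passed at scoring time, so it can report "binary" even though the classifier was fitted on three or more classes. A quick illustration:

    from sklearn.utils.multiclass import type_of_target

    # Scoring a subset that happens to contain only two of a classifier's
    # three classes: the target looks binary...
    print(type_of_target([0, 1, 1, 0]))  # 'binary'
    # ...but predict_proba from the 3-class classifier still has shape
    # (n_samples, 3), so the binary column selection must be skipped.
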
@@ -298,22 +332,24 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
         try:
             y_pred = method_caller(clf, "decision_function", X)
 
-            # For multi-output multi-class estimator
             if isinstance(y_pred, list):
+                # For multi-output multi-class estimator
                 y_pred = np.vstack([p for p in y_pred]).T
+            elif y_type == "binary" and "pos_label" in self._kwargs:
+                self._check_pos_label(
+                    self._kwargs["pos_label"], clf.classes_
+                )
+                if self._kwargs["pos_label"] == clf.classes_[0]:
+                    # The implicit positive class of the binary classifier
+                    # does not match `pos_label`: we need to invert the
+                    # predictions
+                    y_pred *= -1
 
         except (NotImplementedError, AttributeError):
             y_pred = method_caller(clf, "predict_proba", X)
 
             if y_type == "binary":
-                if y_pred.shape[1] == 2:
-                    y_pred = y_pred[:, 1]
-                else:
-                    raise ValueError('got predict_proba of shape {},'
-                                     ' but need classifier with two'
-                                     ' classes for {} scoring'.format(
-                                         y_pred.shape,
-                                         self._score_func.__name__))
+                y_pred = self._select_proba_binary(y_pred, clf.classes_)
             elif isinstance(y_pred, list):
                 y_pred = np.vstack([p[:, -1] for p in y_pred]).T
 

sklearn/metrics/tests/test_score_objects.py

+223 −3
@@ -1,3 +1,4 @@
+from copy import deepcopy
 import pickle
 import tempfile
 import shutil
@@ -16,9 +17,18 @@
 from sklearn.utils._testing import ignore_warnings
 
 from sklearn.base import BaseEstimator
-from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score,
-                             log_loss, precision_score, recall_score,
-                             jaccard_score)
+from sklearn.metrics import (
+    average_precision_score,
+    brier_score_loss,
+    f1_score,
+    fbeta_score,
+    jaccard_score,
+    log_loss,
+    precision_score,
+    r2_score,
+    recall_score,
+    roc_auc_score,
+)
 from sklearn.metrics import cluster as cluster_module
 from sklearn.metrics import check_scoring
 from sklearn.metrics._scorer import (_PredictScorer, _passthrough_scorer,
@@ -618,6 +628,8 @@ def test_multimetric_scorer_calls_method_once(scorers, expected_predict_count,
     mock_est.predict = predict_func
     mock_est.predict_proba = predict_proba_func
     mock_est.decision_function = decision_function_func
+    # add the classes that would be found during fit
+    mock_est.classes_ = np.array([0, 1])
 
     scorer_dict = _check_multimetric_scoring(LogisticRegression(), scorers)
     multi_scorer = _MultimetricScorer(**scorer_dict)
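
The two lines added to the mock are needed because, after this commit, the proba and threshold scorers read `clf.classes_` (via `_select_proba_binary` and `_check_pos_label`), so a duck-typed estimator without that attribute now fails. A hypothetical minimal reproduction (not from the test suite):

    import numpy as np
    from sklearn.metrics import brier_score_loss, make_scorer

    class MockBinaryClassifier:
        # deliberately no classes_ attribute
        def predict_proba(self, X):
            return np.tile([0.4, 0.6], (len(X), 1))

    X = np.zeros((4, 2))
    y = np.array([0, 1, 0, 1])
    scorer = make_scorer(brier_score_loss, needs_proba=True)
    try:
        scorer(MockBinaryClassifier(), X, y)
    except AttributeError as exc:
        print(exc)  # the scorer looked up clf.classes_
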
@@ -747,3 +759,211 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name):
     msg = "'Perceptron' object has no attribute 'predict_proba'"
     with pytest.raises(AttributeError, match=msg):
         scorer(lr, X, y)
+
+
+@pytest.fixture
+def string_labeled_classification_problem():
+    """Train a classifier on a binary problem with a string target.
+
+    The classifier is trained on a binary classification problem where the
+    minority class of interest has a string label that is intentionally not the
+    greatest class label using the lexicographic order. In this case, "cancer"
+    is the positive label, and `classifier.classes_` is
+    `["cancer", "not cancer"]`.
+
+    In addition, the dataset is imbalanced to better identify problems when
+    using non-symmetric performance metrics such as f1-score, average precision
+    and so on.
+
+    Returns
+    -------
+    classifier : estimator object
+        Trained classifier on the binary problem.
+    X_test : ndarray of shape (n_samples, n_features)
+        Data to be used as testing set in tests.
+    y_test : ndarray of shape (n_samples,), dtype=object
+        Binary target where labels are strings.
+    y_pred : ndarray of shape (n_samples,), dtype=object
+        Prediction of `classifier` when predicting for `X_test`.
+    y_pred_proba : ndarray of shape (n_samples, 2), dtype=np.float64
+        Probabilities of `classifier` when predicting for `X_test`.
+    y_pred_decision : ndarray of shape (n_samples,), dtype=np.float64
+        Decision function values of `classifier` when predicting on `X_test`.
+    """
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.utils import shuffle
+
+    X, y = load_breast_cancer(return_X_y=True)
+    # create a highly imbalanced classification task
+    idx_positive = np.flatnonzero(y == 1)
+    idx_negative = np.flatnonzero(y == 0)
+    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
+    X, y = X[idx_selected], y[idx_selected]
+    X, y = shuffle(X, y, random_state=42)
+    # only use 2 features to make the problem even harder
+    X = X[:, :2]
+    y = np.array(
+        ["cancer" if c == 1 else "not cancer" for c in y], dtype=object
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, stratify=y, random_state=0,
+    )
+    classifier = LogisticRegression().fit(X_train, y_train)
+    y_pred = classifier.predict(X_test)
+    y_pred_proba = classifier.predict_proba(X_test)
+    y_pred_decision = classifier.decision_function(X_test)
+
+    return classifier, X_test, y_test, y_pred, y_pred_proba, y_pred_decision
+
+
+def test_average_precision_pos_label(string_labeled_classification_problem):
+    # check that _ThresholdScorer will lead to the right score when passing
+    # `pos_label`. Currently, only `average_precision_score` is defined to
+    # be such a scorer.
+    clf, X_test, y_test, _, y_pred_proba, y_pred_decision = \
+        string_labeled_classification_problem
+
+    pos_label = "cancer"
+    # we need to select the positive column or reverse the decision values
+    y_pred_proba = y_pred_proba[:, 0]
+    y_pred_decision = y_pred_decision * -1
+    assert clf.classes_[0] == pos_label
+
+    # check that when calling the scoring function, probability estimates and
+    # decision values lead to the same results
+    ap_proba = average_precision_score(
+        y_test, y_pred_proba, pos_label=pos_label
+    )
+    ap_decision_function = average_precision_score(
+        y_test, y_pred_decision, pos_label=pos_label
+    )
+    assert ap_proba == pytest.approx(ap_decision_function)
+
+    # create a scorer which would require to pass a `pos_label`
+    # check that it fails if `pos_label` is not provided
+    average_precision_scorer = make_scorer(
+        average_precision_score, needs_threshold=True,
+    )
+    err_msg = "pos_label=1 is not a valid label. It should be one of "
+    with pytest.raises(ValueError, match=err_msg):
+        average_precision_scorer(clf, X_test, y_test)
+
+    # otherwise, the scorer should give the same results as calling the
+    # scoring function
+    average_precision_scorer = make_scorer(
+        average_precision_score, needs_threshold=True, pos_label=pos_label
+    )
+    ap_scorer = average_precision_scorer(clf, X_test, y_test)
+
+    assert ap_scorer == pytest.approx(ap_proba)
+
+    # The above scorer call is using `clf.decision_function`. We will force
+    # it to use `clf.predict_proba`.
+    clf_without_predict_proba = deepcopy(clf)
+
+    def _predict_proba(self, X):
+        raise NotImplementedError
+
+    clf_without_predict_proba.predict_proba = partial(
+        _predict_proba, clf_without_predict_proba
+    )
+    # sanity check
+    with pytest.raises(NotImplementedError):
+        clf_without_predict_proba.predict_proba(X_test)
+
+    ap_scorer = average_precision_scorer(
+        clf_without_predict_proba, X_test, y_test
+    )
+    assert ap_scorer == pytest.approx(ap_proba)
+
+
+def test_brier_score_loss_pos_label(string_labeled_classification_problem):
+    # check that _ProbaScorer leads to the right score when `pos_label` is
+    # provided. Currently only the `brier_score_loss` is defined to be such
+    # a scorer.
+    clf, X_test, y_test, _, y_pred_proba, _ = \
+        string_labeled_classification_problem
+
+    pos_label = "cancer"
+    assert clf.classes_[0] == pos_label
+
+    # brier score loss is symmetric
+    brier_pos_cancer = brier_score_loss(
+        y_test, y_pred_proba[:, 0], pos_label="cancer"
+    )
+    brier_pos_not_cancer = brier_score_loss(
+        y_test, y_pred_proba[:, 1], pos_label="not cancer"
+    )
+    assert brier_pos_cancer == pytest.approx(brier_pos_not_cancer)
+
+    brier_scorer = make_scorer(
+        brier_score_loss, needs_proba=True, pos_label=pos_label,
+    )
+    assert brier_scorer(clf, X_test, y_test) == pytest.approx(brier_pos_cancer)
+
+
+@pytest.mark.parametrize(
+    "score_func", [f1_score, precision_score, recall_score, jaccard_score]
+)
+def test_non_symmetric_metric_pos_label(
+    score_func, string_labeled_classification_problem
+):
+    # check that _PredictScorer leads to the right score when `pos_label` is
+    # provided. We check for all supported metrics.
+    # Note: At some point we may end up having "scorer tags".
+    clf, X_test, y_test, y_pred, _, _ = string_labeled_classification_problem
+
+    pos_label = "cancer"
+    assert clf.classes_[0] == pos_label
+
+    score_pos_cancer = score_func(y_test, y_pred, pos_label="cancer")
+    score_pos_not_cancer = score_func(y_test, y_pred, pos_label="not cancer")
+
+    assert score_pos_cancer != pytest.approx(score_pos_not_cancer)
+
+    scorer = make_scorer(score_func, pos_label=pos_label)
+    assert scorer(clf, X_test, y_test) == pytest.approx(score_pos_cancer)
+
+
+@pytest.mark.parametrize(
+    "scorer",
+    [
+        make_scorer(
+            average_precision_score, needs_threshold=True, pos_label="xxx"
+        ),
+        make_scorer(brier_score_loss, needs_proba=True, pos_label="xxx"),
+        make_scorer(f1_score, pos_label="xxx")
+    ],
+    ids=["ThresholdScorer", "ProbaScorer", "PredictScorer"],
+)
+def test_scorer_select_proba_error(scorer):
+    # check that we raise the proper error when passing an unknown
+    # pos_label
+    X, y = make_classification(
+        n_classes=2, n_informative=3, n_samples=20, random_state=0
+    )
+    lr = LogisticRegression().fit(X, y)
+    assert scorer._kwargs["pos_label"] not in np.unique(y).tolist()
+
+    err_msg = "is not a valid label"
+    with pytest.raises(ValueError, match=err_msg):
+        scorer(lr, X, y)
+
+
+def test_scorer_no_op_multiclass_select_proba():
+    # check that calling a ProbaScorer on a multiclass problem does not raise
+    # even if `y_true` would be binary during the scoring.
+    # `_select_proba_binary` should not be called in this case.
+    X, y = make_classification(
+        n_classes=3, n_informative=3, n_samples=20, random_state=0
+    )
+    lr = LogisticRegression().fit(X, y)
+
+    mask_last_class = y == lr.classes_[-1]
+    X_test, y_test = X[~mask_last_class], y[~mask_last_class]
+    assert_array_equal(np.unique(y_test), lr.classes_[:-1])
+
+    scorer = make_scorer(
+        roc_auc_score, needs_proba=True, multi_class="ovo", labels=lr.classes_,
+    )
+    scorer(lr, X_test, y_test)
