
[MRG] FIX Correct brier_score_loss when there's only one class in y_true #13628


Merged: 9 commits merged on Apr 26, 2019
6 changes: 5 additions & 1 deletion doc/whats_new/v0.21.rst
@@ -480,7 +480,11 @@ Support for Python 3.4 and below has been officially dropped.
and now it returns NaN and raises :class:`exceptions.UndefinedMetricWarning`.
:pr:`12855` by :user:`Pawel Sendyk <psendyk>`.

- |Fix| Fixed a bug in :func:`metrics.label_ranking_average_precision_score`
- |Fix| Fixed a bug where :func:`metrics.brier_score_loss` would sometimes
return an incorrect result when there's only one class in ``y_true``.
:pr:`13628` by :user:`Hanmin Qin <qinhanmin2014>`.

- |Fix| Fixed a bug in :func:`metrics.label_ranking_average_precision_score`
where sample_weight wasn't taken into account for samples with degenerate
labels.
:pr:`13447` by :user:`Dan Ellis <dpwe>`.
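For context, a quick illustration of the behaviour this changelog entry describes; the expected values are taken from the tests added in this PR and assume a scikit-learn build with this patch applied:

```python
from sklearn.metrics import brier_score_loss

# Only class 0 is present: pos_label defaults to 1, so a predicted probability
# of 0.4 for the positive class gives a loss of (0 - 0.4) ** 2 = 0.16.
print(brier_score_loss([0], [0.4]))  # 0.16

# Only class 1 is present: the greater label (1) is the positive class,
# so the loss is (1 - 0.4) ** 2 = 0.36.
print(brier_score_loss([1], [0.4]))  # 0.36
```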
8 changes: 6 additions & 2 deletions sklearn/calibration.py
@@ -25,7 +25,6 @@
from .isotonic import IsotonicRegression
from .svm import LinearSVC
from .model_selection import check_cv
from .metrics.classification import _check_binary_probabilistic_predictions


class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
@@ -572,14 +571,19 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5,
"""
y_true = column_or_1d(y_true)
y_prob = column_or_1d(y_prob)
check_consistent_length(y_true, y_prob)

if normalize: # Normalize predicted values into interval [0, 1]
y_prob = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min())
elif y_prob.min() < 0 or y_prob.max() > 1:
raise ValueError("y_prob has values outside [0, 1] and normalize is "
"set to False.")

y_true = _check_binary_probabilistic_predictions(y_true, y_prob)
Member (review comment): why are we no longer validating y_prob?

Member Author (reply): y_prob is already validated above.

labels = np.unique(y_true)
if len(labels) > 2:
raise ValueError("Only binary classification is supported. "
"Provided labels %s." % labels)
y_true = label_binarize(y_true, labels)[:, 0]

if strategy == 'quantile': # Determine bin edges by distribution of data
quantiles = np.linspace(0, 1, n_bins + 1)
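As a usage sketch of the function touched above, a minimal `calibration_curve` call; the input arrays here are made up purely for illustration:

```python
import numpy as np
from sklearn.calibration import calibration_curve

y_true = np.array([0, 0, 0, 1, 1, 1])
y_prob = np.array([0.1, 0.2, 0.3, 0.7, 0.8, 0.9])

# y_prob is grouped into n_bins intervals; for each bin the function returns
# the fraction of positives (prob_true) and the mean predicted probability
# (prob_pred), which together form one point of the calibration curve.
prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=2)
print(prob_true)  # fraction of positives per bin
print(prob_pred)  # mean predicted probability per bin
```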
47 changes: 23 additions & 24 deletions sklearn/metrics/classification.py
@@ -28,7 +28,7 @@
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix

from ..preprocessing import LabelBinarizer, label_binarize
from ..preprocessing import LabelBinarizer
from ..preprocessing import LabelEncoder
from ..utils import assert_all_finite
from ..utils import check_array
@@ -2301,25 +2301,6 @@ def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None):
return np.average(losses, weights=sample_weight)


def _check_binary_probabilistic_predictions(y_true, y_prob):
"""Check that y_true is binary and y_prob contains valid probabilities"""
check_consistent_length(y_true, y_prob)

labels = np.unique(y_true)

if len(labels) > 2:
raise ValueError("Only binary classification is supported. "
"Provided labels %s." % labels)

if y_prob.max() > 1:
raise ValueError("y_prob contains values greater than 1.")

if y_prob.min() < 0:
raise ValueError("y_prob contains values less than 0.")

return label_binarize(y_true, labels)[:, 0]


def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None):
"""Compute the Brier score.
The smaller the Brier score, the better, hence the naming with "loss".
@@ -2353,8 +2334,9 @@ def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None):
Sample weights.

pos_label : int or str, default=None
Label of the positive class. If None, the maximum label is used as
positive class
Label of the positive class.
Defaults to the greater label unless y_true is all 0 or all -1
in which case pos_label defaults to 1.

Returns
-------
@@ -2389,8 +2371,25 @@ def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None):
assert_all_finite(y_prob)
check_consistent_length(y_true, y_prob, sample_weight)

labels = np.unique(y_true)
if len(labels) > 2:
raise ValueError("Only binary classification is supported. "
"Labels in y_true: %s." % labels)
if y_prob.max() > 1:
raise ValueError("y_prob contains values greater than 1.")
if y_prob.min() < 0:
raise ValueError("y_prob contains values less than 0.")

# If pos_label=None and y_true is in {-1, 1} or {0, 1},
# pos_label is set to 1 (consistent with precision_recall_curve/roc_curve);
# otherwise pos_label is set to the greater label
# (unlike precision_recall_curve/roc_curve,
# in order to keep backward compatibility).
if pos_label is None:
pos_label = y_true.max()
if (np.array_equal(labels, [0]) or
np.array_equal(labels, [-1])):
pos_label = 1
else:
pos_label = y_true.max()
y_true = np.array(y_true == pos_label, int)
y_true = _check_binary_probabilistic_predictions(y_true, y_prob)
return np.average((y_true - y_prob) ** 2, weights=sample_weight)
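A small sketch of the new `pos_label` handling with string labels, mirroring the tests added below; the labels `'foo'`/`'bar'` are purely illustrative:

```python
from sklearn.metrics import brier_score_loss

# With a single non-numeric label, pos_label has to be given explicitly.
# 'bar' as positive class: y_true == pos_label is False (0), so the loss
# is (0 - 0.4) ** 2 = 0.16.
print(brier_score_loss(['foo'], [0.4], pos_label='bar'))  # 0.16

# 'foo' as positive class: y_true == pos_label is True (1), so the loss
# is (1 - 0.4) ** 2 = 0.36.
print(brier_score_loss(['foo'], [0.4], pos_label='foo'))  # 0.36
```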
11 changes: 8 additions & 3 deletions sklearn/metrics/ranking.py
@@ -469,13 +469,16 @@ def precision_recall_curve(y_true, probas_pred, pos_label=None,
Parameters
----------
y_true : array, shape = [n_samples]
True targets of binary classification in range {-1, 1} or {0, 1}.
True binary labels. If labels are not either {-1, 1} or {0, 1}, then
pos_label should be explicitly given.

probas_pred : array, shape = [n_samples]
Estimated probabilities or decision function.

pos_label : int or str, default=None
The label of the positive class
The label of the positive class.
When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
``pos_label`` is set to 1, otherwise an error will be raised.

sample_weight : array-like of shape = [n_samples], optional
Sample weights.
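To make the documented default concrete, a minimal `precision_recall_curve` call with {0, 1} labels; the score values are arbitrary:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

y_true = np.array([0, 0, 1, 1])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])

# Labels are in {0, 1}, so pos_label=None is interpreted as pos_label=1.
precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
print(precision, recall, thresholds)
```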
@@ -552,7 +555,9 @@ def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
(as returned by "decision_function" on some classifiers).

pos_label : int or str, default=None
Label considered as positive and others are considered negative.
The label of the positive class.
When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
``pos_label`` is set to 1, otherwise an error will be raised.

sample_weight : array-like of shape = [n_samples], optional
Sample weights.
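Similarly for `roc_curve`: when the labels are not {-1, 1} or {0, 1}, the positive class has to be named explicitly. A minimal sketch with labels 1 and 2, chosen only for illustration:

```python
import numpy as np
from sklearn.metrics import roc_curve

y_true = np.array([1, 1, 2, 2])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])

# y_true uses labels {1, 2}, so pos_label must be passed explicitly.
fpr, tpr, thresholds = roc_curve(y_true, y_scores, pos_label=2)
print(fpr, tpr, thresholds)
```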
20 changes: 17 additions & 3 deletions sklearn/metrics/tests/test_classification.py
@@ -1997,9 +1997,23 @@ def test_brier_score_loss():
assert_raises(ValueError, brier_score_loss, y_true, y_pred[1:])
assert_raises(ValueError, brier_score_loss, y_true, y_pred + 1.)
assert_raises(ValueError, brier_score_loss, y_true, y_pred - 1.)
# calculate even if only single class in y_true (#6980)
assert_almost_equal(brier_score_loss([0], [0.5]), 0.25)
assert_almost_equal(brier_score_loss([1], [0.5]), 0.25)

# ensure to raise an error for multiclass y_true
y_true = np.array([0, 1, 2, 0])
y_pred = np.array([0.8, 0.6, 0.4, 0.2])
error_message = ("Only binary classification is supported. Labels "
"in y_true: {}".format(np.array([0, 1, 2])))
assert_raise_message(ValueError, error_message, brier_score_loss,
y_true, y_pred)

# calculate correctly when there's only one class in y_true
assert_almost_equal(brier_score_loss([-1], [0.4]), 0.16)
assert_almost_equal(brier_score_loss([0], [0.4]), 0.16)
assert_almost_equal(brier_score_loss([1], [0.4]), 0.36)
assert_almost_equal(
brier_score_loss(['foo'], [0.4], pos_label='bar'), 0.16)
assert_almost_equal(
brier_score_loss(['foo'], [0.4], pos_label='foo'), 0.36)


def test_balanced_accuracy_score_unseen():