FIX more precise log loss gradient and hessian #28048

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account


Merged: 8 commits, Jan 9, 2024
Changes from all commits

doc/whats_new/v1.4.rst (11 additions, 0 deletions)

@@ -220,6 +220,17 @@ See :ref:`array_api` for more details.
 - :class:`preprocessing.MinMaxScaler` in :pr:`26243` by `Tim Head`_;
 - :class:`preprocessing.Normalizer` in :pr:`27558` by :user:`Edoardo Abati <EdAbati>`.

+Private Loss Function Module
+----------------------------
+
+- |FIX| The gradient computation of the binomial log loss is now numerically
+  more stable for raw predictions with very large absolute values; before, it
+  could result in `np.nan`. Among the models that benefit from this change are
+  :class:`ensemble.GradientBoostingClassifier`,
+  :class:`ensemble.HistGradientBoostingClassifier` and
+  :class:`linear_model.LogisticRegression`.
+  :pr:`28048` by :user:`Christian Lorentzen <lorentzenchr>`.
+
 Changelog
 ---------

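To make the numerical issue concrete before diving into the diff: the sketch below is not part of the PR (the helper names are made up for illustration). It mirrors the single-formula gradient used before this change and the branched variant introduced in `_loss.pyx.tp` below, evaluated where exp(-raw_prediction) overflows.

import numpy as np


def gradient_single_formula(y_true, raw_prediction):
    # Pre-fix style: one formula for every input. exp(-raw_prediction)
    # overflows to inf for very negative inputs, and inf / inf yields nan.
    exp_tmp = np.exp(-raw_prediction)
    return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)


def gradient_branched(y_true, raw_prediction):
    # Post-fix style: for raw_prediction <= -37, expit(raw_prediction) equals
    # exp(raw_prediction) in double precision, so the overflow never happens.
    if raw_prediction > -37:
        exp_tmp = np.exp(-raw_prediction)
        return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
    return np.exp(raw_prediction) - y_true


with np.errstate(over="ignore", invalid="ignore"):
    print(gradient_single_formula(1.0, -1e3))  # nan
print(gradient_branched(1.0, -1e3))            # -1.0

For y_true = 1 and a strongly negative raw prediction the gradient should be -1 in double precision, which only the branched version delivers.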

sklearn/_loss/_loss.pyx.tp (40 additions, 22 deletions)

@@ -695,9 +695,8 @@ cdef inline double cgradient_half_binomial(
     double y_true,
     double raw_prediction
 ) noexcept nogil:
-    # y_pred - y_true = expit(raw_prediction) - y_true
-    # Numerically more stable, see
-    # http://fa.bianp.net/blog/2019/evaluate_logistic/
+    # gradient = y_pred - y_true = expit(raw_prediction) - y_true
+    # Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/
     # if raw_prediction < 0:
     #     exp_tmp = exp(raw_prediction)
     #     return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp)
@@ -708,34 +707,47 @@
     #     return expit(raw_prediction) - y_true
     # i.e. no "if else" and an own inline implementation of expit instead of
     #     from scipy.special.cython_special cimport expit
-    # The case distinction raw_prediction < 0 in the stable implementation
-    # does not provide significant better precision. Therefore we go without
-    # it.
+    # The case distinction raw_prediction < 0 in the stable implementation does not
+    # provide significant better precision apart from protecting overflow of exp(..).
+    # The branch (if else), however, can incur runtime costs of up to 30%.
+    # Instead, we help branch prediction by almost always ending in the first if clause
+    # and making the second branch (else) a bit simpler. This has the exact same
+    # precision but is faster than the stable implementation.
+    # As branching criteria, we use the same cutoff as in log1pexp. Note that the
+    # maximal value to get gradient = -1 with y_true = 1 is -37.439198610162731
+    # (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365.
     cdef double exp_tmp
-    exp_tmp = exp(-raw_prediction)
-    return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
+    if raw_prediction > -37:
+        exp_tmp = exp(-raw_prediction)
+        return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
+    else:
+        # expit(raw_prediction) = exp(raw_prediction) for raw_prediction <= -37
+        return exp(raw_prediction) - y_true


 cdef inline double_pair closs_grad_half_binomial(
     double y_true,
     double raw_prediction
 ) noexcept nogil:
     cdef double_pair lg
-    if raw_prediction <= 0:
+    # Same if else conditions as in log1pexp.
+    if raw_prediction <= -37:
         lg.val2 = exp(raw_prediction)  # used as temporary
-        if raw_prediction <= -37:
-            lg.val1 = lg.val2 - y_true * raw_prediction  # loss
-        else:
-            lg.val1 = log1p(lg.val2) - y_true * raw_prediction  # loss
+        lg.val1 = lg.val2 - y_true * raw_prediction  # loss
         lg.val2 -= y_true  # gradient
+    elif raw_prediction <= -2:
+        lg.val2 = exp(raw_prediction)  # used as temporary
+        lg.val1 = log1p(lg.val2) - y_true * raw_prediction  # loss
+        lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2)  # gradient
+    elif raw_prediction <= 18:
+        lg.val2 = exp(-raw_prediction)  # used as temporary
+        # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x))
+        lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction  # loss
+        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
     else:
         lg.val2 = exp(-raw_prediction)  # used as temporary
-        if raw_prediction <= 18:
-            # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x))
-            lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction  # loss
-        else:
-            lg.val1 = lg.val2 + (1 - y_true) * raw_prediction  # loss
-        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
+        lg.val1 = lg.val2 + (1 - y_true) * raw_prediction  # loss
+        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
     return lg
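
As a sanity check on the four branch conditions above, here is an illustrative pure-Python port of closs_grad_half_binomial (not part of the PR; names are invented). It cross-checks loss and gradient against reference formulas evaluated only where those references involve no cancellation or overflow.

import numpy as np
from scipy.special import expit


def loss_grad_half_binomial(y_true, raw):
    # Pure-Python port of the four if/else branches; returns (loss, gradient).
    if raw <= -37:
        tmp = np.exp(raw)
        loss = tmp - y_true * raw
        grad = tmp - y_true
    elif raw <= -2:
        tmp = np.exp(raw)
        loss = np.log1p(tmp) - y_true * raw
        grad = ((1 - y_true) * tmp - y_true) / (1 + tmp)
    elif raw <= 18:
        tmp = np.exp(-raw)
        loss = np.log1p(tmp) + (1 - y_true) * raw
        grad = ((1 - y_true) - y_true * tmp) / (1 + tmp)
    else:
        tmp = np.exp(-raw)
        loss = tmp + (1 - y_true) * raw
        grad = ((1 - y_true) - y_true * tmp) / (1 + tmp)
    return loss, grad


# Cross-check against log(1 + exp(raw)) - y * raw and expit(raw) - y for y = 0,
# where the reference formulas are themselves accurate.
for raw in (-50.0, -30.0, -5.0, 0.5, 10.0, 25.0):
    loss, grad = loss_grad_half_binomial(0.0, raw)
    assert np.isclose(loss, np.logaddexp(0.0, raw), rtol=1e-12)
    assert np.isclose(grad, expit(raw), rtol=1e-12)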


@@ -747,9 +759,15 @@ cdef inline double_pair cgrad_hess_half_binomial(
     # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2
     #                                 = exp(-raw) / (1 + exp(-raw))**2
     cdef double_pair gh
-    gh.val2 = exp(-raw_prediction)  # used as temporary
-    gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2)  # gradient
-    gh.val2 = gh.val2 / (1 + gh.val2)**2  # hessian
+    # See comment in cgradient_half_binomial.
+    if raw_prediction > -37:
+        gh.val2 = exp(-raw_prediction)  # used as temporary
+        gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2)  # gradient
+        gh.val2 = gh.val2 / (1 + gh.val2)**2  # hessian
+    else:
+        gh.val2 = exp(raw_prediction)
+        gh.val1 = gh.val2 - y_true
+        gh.val2 *= (1 - gh.val2)
     return gh


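The cutoff used in both new else branches can be checked directly. A small sketch (illustrative only, using the same constants as the code comments above):

import numpy as np
from scipy.special import expit, logit

raw = -37.0
p = expit(raw)                     # predicted probability, about 8.53e-17
assert np.isclose(p, np.exp(raw), rtol=1e-14)  # expit(raw) matches exp(raw) here

# The hessian is p * (1 - p); with p = exp(raw) this becomes the
# exp(raw) * (1 - exp(raw)) of the new else branch above.
print(p * (1 - p))                 # ~8.533047625744065e-17, as in the new test cases

# logit(eps) ~ -36.04, so any probability larger than machine epsilon
# corresponds to a raw prediction above the -37 cutoff.
print(logit(np.finfo(float).eps))  # about -36.04365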

sklearn/_loss/tests/test_loss.py (121 additions, 19 deletions)

@@ -224,48 +224,150 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail):


 @pytest.mark.parametrize(
-    "loss, y_true, raw_prediction, loss_true",
+    "loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true",
     [
-        (HalfSquaredError(), 1.0, 5.0, 8),
-        (AbsoluteError(), 1.0, 5.0, 4),
-        (PinballLoss(quantile=0.5), 1.0, 5.0, 2),
-        (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25)),
-        (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25),
-        (HuberLoss(quantile=0.5, delta=3), 1.0, 5.0, 3 * (4 - 3 / 2)),
-        (HuberLoss(quantile=0.5, delta=3), 1.0, 3.0, 0.5 * 2**2),
-        (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4)),
-        (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4),
-        (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4**2),
-        (HalfTweedieLossIdentity(power=1), 2.0, 4.0, 2 - 2 * np.log(2)),
-        (HalfTweedieLossIdentity(power=2), 2.0, 4.0, np.log(2) - 1 / 2),
-        (HalfTweedieLossIdentity(power=3), 2.0, 4.0, -1 / 4 + 1 / 4**2 + 1 / 2 / 2),
-        (HalfBinomialLoss(), 0.25, np.log(4), np.log(5) - 0.25 * np.log(4)),
+        (HalfSquaredError(), 1.0, 5.0, 8, 4, 1),
+        (AbsoluteError(), 1.0, 5.0, 4.0, 1.0, None),
+        (PinballLoss(quantile=0.5), 1.0, 5.0, 2, 0.5, None),
+        (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25), 1 - 0.25, None),
+        (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25, -0.25, None),
+        (HuberLoss(quantile=0.5, delta=3), 1.0, 5.0, 3 * (4 - 3 / 2), None, None),
+        (HuberLoss(quantile=0.5, delta=3), 1.0, 3.0, 0.5 * 2**2, None, None),
+        (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4), 4 - 2, 4),
+        (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4, 1 - 2 / 4, 2 / 4),
+        (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4**2, None, None),
+        (HalfTweedieLossIdentity(power=1), 2.0, 4.0, 2 - 2 * np.log(2), None, None),
+        (HalfTweedieLossIdentity(power=2), 2.0, 4.0, np.log(2) - 1 / 2, None, None),
+        (
+            HalfTweedieLossIdentity(power=3),
+            2.0,
+            4.0,
+            -1 / 4 + 1 / 4**2 + 1 / 2 / 2,
+            None,
+            None,
+        ),
+        (
+            HalfBinomialLoss(),
+            0.25,
+            np.log(4),
+            np.log1p(4) - 0.25 * np.log(4),
+            None,
+            None,
+        ),
+        # Extreme log loss cases, checked with mpmath:
+        # import mpmath as mp
+        #
+        # # Stolen from scipy
+        # def mpf2float(x):
+        #     return float(mp.nstr(x, 17, min_fixed=0, max_fixed=0))
+        #
+        # def mp_logloss(y_true, raw):
+        #     with mp.workdps(100):
+        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
+        #         out = mp.log1p(mp.exp(raw)) - y_true * raw
+        #     return mpf2float(out)
+        #
+        # def mp_gradient(y_true, raw):
+        #     with mp.workdps(100):
+        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
+        #         out = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw)) - y_true
+        #     return mpf2float(out)
+        #
+        # def mp_hessian(y_true, raw):
+        #     with mp.workdps(100):
+        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
+        #         p = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw))
+        #         out = p * (mp.mpf(1) - p)
+        #     return mpf2float(out)
+        #
+        # y, raw = 0.0, 37.
+        # mp_logloss(y, raw), mp_gradient(y, raw), mp_hessian(y, raw)
+        (HalfBinomialLoss(), 0.0, -1e20, 0, 0, 0),
+        (HalfBinomialLoss(), 1.0, -1e20, 1e20, -1, 0),
+        (HalfBinomialLoss(), 0.0, -1e3, 0, 0, 0),
+        (HalfBinomialLoss(), 1.0, -1e3, 1e3, -1, 0),
+        (HalfBinomialLoss(), 1.0, -37.5, 37.5, -1, 0),
+        (HalfBinomialLoss(), 1.0, -37.0, 37, 1e-16 - 1, 8.533047625744065e-17),
+        (HalfBinomialLoss(), 0.0, -37.0, *[8.533047625744065e-17] * 3),
+        (HalfBinomialLoss(), 1.0, -36.9, 36.9, 1e-16 - 1, 9.430476078526806e-17),
+        (HalfBinomialLoss(), 0.0, -36.9, *[9.430476078526806e-17] * 3),
+        (HalfBinomialLoss(), 0.0, 37.0, 37, 1 - 1e-16, 8.533047625744065e-17),
+        (HalfBinomialLoss(), 1.0, 37.0, *[8.533047625744066e-17] * 3),
+        (HalfBinomialLoss(), 0.0, 37.5, 37.5, 1, 5.175555005801868e-17),
+        (HalfBinomialLoss(), 0.0, 232.8, 232.8, 1, 1.4287342391028437e-101),
+        (HalfBinomialLoss(), 1.0, 1e20, 0, 0, 0),
+        (HalfBinomialLoss(), 0.0, 1e20, 1e20, 1, 0),
+        (
+            HalfBinomialLoss(),
+            1.0,
+            232.8,
+            0,
+            -1.4287342391028437e-101,
+            1.4287342391028437e-101,
+        ),
+        (HalfBinomialLoss(), 1.0, 232.9, 0, 0, 0),
+        (HalfBinomialLoss(), 1.0, 1e3, 0, 0, 0),
+        (HalfBinomialLoss(), 0.0, 1e3, 1e3, 1, 0),
         (
             HalfMultinomialLoss(n_classes=3),
             0.0,
             [0.2, 0.5, 0.3],
             logsumexp([0.2, 0.5, 0.3]) - 0.2,
+            None,
+            None,
         ),
         (
             HalfMultinomialLoss(n_classes=3),
             1.0,
             [0.2, 0.5, 0.3],
             logsumexp([0.2, 0.5, 0.3]) - 0.5,
+            None,
+            None,
         ),
         (
             HalfMultinomialLoss(n_classes=3),
             2.0,
             [0.2, 0.5, 0.3],
             logsumexp([0.2, 0.5, 0.3]) - 0.3,
+            None,
+            None,
         ),
+        (
+            HalfMultinomialLoss(n_classes=3),
+            2.0,
+            [1e4, 0, 7e-7],
+            logsumexp([1e4, 0, 7e-7]) - (7e-7),
+            None,
+            None,
+        ),
     ],
     ids=loss_instance_name,
 )
-def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true):
-    """Test losses at specific values."""
-    assert loss(
+def test_loss_on_specific_values(
+    loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true
+):
+    """Test losses, gradients and hessians at specific values."""
+    loss1 = loss(y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]))
+    grad1 = loss.gradient(
+        y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
+    )
+    loss2, grad2 = loss.loss_gradient(
+        y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
+    )
+    grad3, hess = loss.gradient_hessian(
         y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
-    ) == approx(loss_true, rel=1e-11, abs=1e-12)
+    )

+    assert loss1 == approx(loss_true, rel=1e-15, abs=1e-15)
+    assert loss2 == approx(loss_true, rel=1e-15, abs=1e-15)

+    if gradient_true is not None:
+        assert grad1 == approx(gradient_true, rel=1e-15, abs=1e-15)
+        assert grad2 == approx(gradient_true, rel=1e-15, abs=1e-15)
+        assert grad3 == approx(gradient_true, rel=1e-15, abs=1e-15)

+    if hessian_true is not None:
+        assert hess == approx(hessian_true, rel=1e-15, abs=1e-15)


 @pytest.mark.parametrize("loss", ALL_LOSSES)
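
Finally, a short usage sketch of the private loss API exercised by the new test; the expected numbers are taken from the parametrization above, and a scikit-learn build containing this fix is assumed. HalfBinomialLoss lives in a private module, so this is for illustration and debugging only.

import numpy as np
from sklearn._loss.loss import HalfBinomialLoss

loss = HalfBinomialLoss()
y_true = np.array([1.0])
raw_prediction = np.array([-1e3])  # very confident raw score on the wrong side

gradient, hessian = loss.gradient_hessian(
    y_true=y_true, raw_prediction=raw_prediction
)
print(gradient, hessian)  # [-1.] [0.], where the old gradient could give nan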