FIX more precise log loss gradient and hessian #28048

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account


Merged: 8 commits, Jan 9, 2024
Changes from all commits

doc/whats_new/v1.4.rst (11 additions, 0 deletions)

@@ -220,6 +220,17 @@ See :ref:`array_api` for more details.
 - :class:`preprocessing.MinMaxScaler` in :pr:`26243` by `Tim Head`_;
 - :class:`preprocessing.Normalizer` in :pr:`27558` by :user:`Edoardo Abati <EdAbati>`.

+Private Loss Function Module
+----------------------------
+
+- |FIX| The gradient computation of the binomial log loss is now numerically
+  more stable for raw predictions with very large absolute values; before, it
+  could result in `np.nan`. Among the models that benefit from this change are
+  :class:`ensemble.GradientBoostingClassifier`,
+  :class:`ensemble.HistGradientBoostingClassifier` and
+  :class:`linear_model.LogisticRegression`.
+  :pr:`28048` by :user:`Christian Lorentzen <lorentzenchr>`.
+
 Changelog
 ---------

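To make the numerical issue concrete before diving into the diff: the sketch below is not part of the PR (the helper names are made up for illustration). It mirrors the single-formula gradient used before this change and the branched variant introduced in `_loss.pyx.tp` below, evaluated where exp(-raw_prediction) overflows.

import numpy as np


def gradient_single_formula(y_true, raw_prediction):
    # Pre-fix style: one formula for every input. exp(-raw_prediction)
    # overflows to inf for very negative inputs, and inf / inf yields nan.
    exp_tmp = np.exp(-raw_prediction)
    return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)


def gradient_branched(y_true, raw_prediction):
    # Post-fix style: for raw_prediction <= -37, expit(raw_prediction) equals
    # exp(raw_prediction) in double precision, so the overflow never happens.
    if raw_prediction > -37:
        exp_tmp = np.exp(-raw_prediction)
        return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
    return np.exp(raw_prediction) - y_true


with np.errstate(over="ignore", invalid="ignore"):
    print(gradient_single_formula(1.0, -1e3))  # nan
print(gradient_branched(1.0, -1e3))            # -1.0

For y_true = 1 and a strongly negative raw prediction the gradient should be -1 in double precision, which only the branched version delivers.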

sklearn/_loss/_loss.pyx.tp (40 additions, 22 deletions)

@@ -695,9 +695,8 @@ cdef inline double cgradient_half_binomial(
     double y_true,
     double raw_prediction
 ) noexcept nogil:
-    # y_pred - y_true = expit(raw_prediction) - y_true
-    # Numerically more stable, see
-    # http://fa.bianp.net/blog/2019/evaluate_logistic/
+    # gradient = y_pred - y_true = expit(raw_prediction) - y_true
+    # Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/
     # if raw_prediction < 0:
     #     exp_tmp = exp(raw_prediction)
     #     return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp)
@@ -708,34 +707,47 @@
     #     return expit(raw_prediction) - y_true
     # i.e. no "if else" and an own inline implementation of expit instead of
     #     from scipy.special.cython_special cimport expit
-    # The case distinction raw_prediction < 0 in the stable implementation
-    # does not provide significant better precision. Therefore we go without
-    # it.
+    # The case distinction raw_prediction < 0 in the stable implementation does not
+    # provide significant better precision apart from protecting overflow of exp(..).
+    # The branch (if else), however, can incur runtime costs of up to 30%.
+    # Instead, we help branch prediction by almost always ending in the first if clause
+    # and making the second branch (else) a bit simpler. This has the exact same
+    # precision but is faster than the stable implementation.
+    # As branching criteria, we use the same cutoff as in log1pexp. Note that the
+    # maximal value to get gradient = -1 with y_true = 1 is -37.439198610162731
+    # (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365.
     cdef double exp_tmp
-    exp_tmp = exp(-raw_prediction)
-    return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
+    if raw_prediction > -37:
+        exp_tmp = exp(-raw_prediction)
+        return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
+    else:
+        # expit(raw_prediction) = exp(raw_prediction) for raw_prediction <= -37
+        return exp(raw_prediction) - y_true


 cdef inline double_pair closs_grad_half_binomial(
     double y_true,
     double raw_prediction
 ) noexcept nogil:
     cdef double_pair lg
-    if raw_prediction <= 0:
+    # Same if else conditions as in log1pexp.
+    if raw_prediction <= -37:
         lg.val2 = exp(raw_prediction)  # used as temporary
-        if raw_prediction <= -37:
-            lg.val1 = lg.val2 - y_true * raw_prediction  # loss
-        else:
-            lg.val1 = log1p(lg.val2) - y_true * raw_prediction  # loss
+        lg.val1 = lg.val2 - y_true * raw_prediction  # loss
         lg.val2 -= y_true  # gradient
+    elif raw_prediction <= -2:
+        lg.val2 = exp(raw_prediction)  # used as temporary
+        lg.val1 = log1p(lg.val2) - y_true * raw_prediction  # loss
+        lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2)  # gradient
+    elif raw_prediction <= 18:
+        lg.val2 = exp(-raw_prediction)  # used as temporary
+        # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x))
+        lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction  # loss
+        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
     else:
         lg.val2 = exp(-raw_prediction)  # used as temporary
-        if raw_prediction <= 18:
-            # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x))
-            lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction  # loss
-        else:
-            lg.val1 = lg.val2 + (1 - y_true) * raw_prediction  # loss
-        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
+        lg.val1 = lg.val2 + (1 - y_true) * raw_prediction  # loss
+        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
     return lg
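
As a sanity check on the four branch conditions above, here is an illustrative pure-Python port of closs_grad_half_binomial (not part of the PR; names are invented). It cross-checks loss and gradient against reference formulas evaluated only where those references involve no cancellation or overflow.

import numpy as np
from scipy.special import expit


def loss_grad_half_binomial(y_true, raw):
    # Pure-Python port of the four if/else branches; returns (loss, gradient).
    if raw <= -37:
        tmp = np.exp(raw)
        loss = tmp - y_true * raw
        grad = tmp - y_true
    elif raw <= -2:
        tmp = np.exp(raw)
        loss = np.log1p(tmp) - y_true * raw
        grad = ((1 - y_true) * tmp - y_true) / (1 + tmp)
    elif raw <= 18:
        tmp = np.exp(-raw)
        loss = np.log1p(tmp) + (1 - y_true) * raw
        grad = ((1 - y_true) - y_true * tmp) / (1 + tmp)
    else:
        tmp = np.exp(-raw)
        loss = tmp + (1 - y_true) * raw
        grad = ((1 - y_true) - y_true * tmp) / (1 + tmp)
    return loss, grad


# Cross-check against log(1 + exp(raw)) - y * raw and expit(raw) - y for y = 0,
# where the reference formulas are themselves accurate.
for raw in (-50.0, -30.0, -5.0, 0.5, 10.0, 25.0):
    loss, grad = loss_grad_half_binomial(0.0, raw)
    assert np.isclose(loss, np.logaddexp(0.0, raw), rtol=1e-12)
    assert np.isclose(grad, expit(raw), rtol=1e-12)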


@@ -747,9 +759,15 @@ cdef inline double_pair cgrad_hess_half_binomial(
     # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2
     #                                 = exp(-raw) / (1 + exp(-raw))**2
     cdef double_pair gh
-    gh.val2 = exp(-raw_prediction)  # used as temporary
-    gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2)  # gradient
-    gh.val2 = gh.val2 / (1 + gh.val2)**2  # hessian
+    # See comment in cgradient_half_binomial.
+    if raw_prediction > -37:
+        gh.val2 = exp(-raw_prediction)  # used as temporary
+        gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2)  # gradient
+        gh.val2 = gh.val2 / (1 + gh.val2)**2  # hessian
+    else:
+        gh.val2 = exp(raw_prediction)
+        gh.val1 = gh.val2 - y_true
+        gh.val2 *= (1 - gh.val2)
     return gh


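The cutoff used in both new else branches can be checked directly. A small sketch (illustrative only, using the same constants as the code comments above):

import numpy as np
from scipy.special import expit, logit

raw = -37.0
p = expit(raw)                     # predicted probability, about 8.53e-17
assert np.isclose(p, np.exp(raw), rtol=1e-14)  # expit(raw) matches exp(raw) here

# The hessian is p * (1 - p); with p = exp(raw) this becomes the
# exp(raw) * (1 - exp(raw)) of the new else branch above.
print(p * (1 - p))                 # ~8.533047625744065e-17, as in the new test cases

# logit(eps) ~ -36.04, so any probability larger than machine epsilon
# corresponds to a raw prediction above the -37 cutoff.
print(logit(np.finfo(float).eps))  # about -36.04365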

sklearn/_loss/tests/test_loss.py (121 additions, 19 deletions)

@@ -224,48 +224,150 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail):


 @pytest.mark.parametrize(
-    "loss, y_true, raw_prediction, loss_true",
+    "loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true",
     [
-        (HalfSquaredError(), 1.0, 5.0, 8),
-        (AbsoluteError(), 1.0, 5.0, 4),
-        (PinballLoss(quantile=0.5), 1.0, 5.0, 2),
-        (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25)),
-        (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25),
-        (HuberLoss(quantile=0.5, delta=3), 1.0, 5.0, 3 * (4 - 3 / 2)),
-        (HuberLoss(quantile=0.5, delta=3), 1.0, 3.0, 0.5 * 2**2),
-        (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4)),
-        (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4),
-        (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4**2),
-        (HalfTweedieLossIdentity(power=1), 2.0, 4.0, 2 - 2 * np.log(2)),
-        (HalfTweedieLossIdentity(power=2), 2.0, 4.0, np.log(2) - 1 / 2),
-        (HalfTweedieLossIdentity(power=3), 2.0, 4.0, -1 / 4 + 1 / 4**2 + 1 / 2 / 2),
-        (HalfBinomialLoss(), 0.25, np.log(4), np.log(5) - 0.25 * np.log(4)),
+        (HalfSquaredError(), 1.0, 5.0, 8, 4, 1),
+        (AbsoluteError(), 1.0, 5.0, 4.0, 1.0, None),
+        (PinballLoss(quantile=0.5), 1.0, 5.0, 2, 0.5, None),
+        (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25), 1 - 0.25, None),
+        (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25, -0.25, None),
+        (HuberLoss(quantile=0.5, delta=3), 1.0, 5.0, 3 * (4 - 3 / 2), None, None),
+        (HuberLoss(quantile=0.5, delta=3), 1.0, 3.0, 0.5 * 2**2, None, None),
+        (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4), 4 - 2, 4),
+        (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4, 1 - 2 / 4, 2 / 4),
+        (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4**2, None, None),
+        (HalfTweedieLossIdentity(power=1), 2.0, 4.0, 2 - 2 * np.log(2), None, None),
+        (HalfTweedieLossIdentity(power=2), 2.0, 4.0, np.log(2) - 1 / 2, None, None),
+        (
+            HalfTweedieLossIdentity(power=3),
+            2.0,
+            4.0,
+            -1 / 4 + 1 / 4**2 + 1 / 2 / 2,
+            None,
+            None,
+        ),
+        (
+            HalfBinomialLoss(),
+            0.25,
+            np.log(4),
+            np.log1p(4) - 0.25 * np.log(4),
+            None,
+            None,
+        ),
+        # Extreme log loss cases, checked with mpmath:
+        # import mpmath as mp
+        #
+        # # Stolen from scipy
+        # def mpf2float(x):
+        #     return float(mp.nstr(x, 17, min_fixed=0, max_fixed=0))
+        #
+        # def mp_logloss(y_true, raw):
+        #     with mp.workdps(100):
+        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
+        #         out = mp.log1p(mp.exp(raw)) - y_true * raw
+        #     return mpf2float(out)
+        #
+        # def mp_gradient(y_true, raw):
+        #     with mp.workdps(100):
+        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
+        #         out = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw)) - y_true
+        #     return mpf2float(out)
+        #
+        # def mp_hessian(y_true, raw):
+        #     with mp.workdps(100):
+        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
+        #         p = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw))
+        #         out = p * (mp.mpf(1) - p)
+        #     return mpf2float(out)
+        #
+        # y, raw = 0.0, 37.
+        # mp_logloss(y, raw), mp_gradient(y, raw), mp_hessian(y, raw)
+        (HalfBinomialLoss(), 0.0, -1e20, 0, 0, 0),
+        (HalfBinomialLoss(), 1.0, -1e20, 1e20, -1, 0),
+        (HalfBinomialLoss(), 0.0, -1e3, 0, 0, 0),
+        (HalfBinomialLoss(), 1.0, -1e3, 1e3, -1, 0),
+        (HalfBinomialLoss(), 1.0, -37.5, 37.5, -1, 0),
+        (HalfBinomialLoss(), 1.0, -37.0, 37, 1e-16 - 1, 8.533047625744065e-17),
+        (HalfBinomialLoss(), 0.0, -37.0, *[8.533047625744065e-17] * 3),
+        (HalfBinomialLoss(), 1.0, -36.9, 36.9, 1e-16 - 1, 9.430476078526806e-17),
+        (HalfBinomialLoss(), 0.0, -36.9, *[9.430476078526806e-17] * 3),
+        (HalfBinomialLoss(), 0.0, 37.0, 37, 1 - 1e-16, 8.533047625744065e-17),
+        (HalfBinomialLoss(), 1.0, 37.0, *[8.533047625744066e-17] * 3),
+        (HalfBinomialLoss(), 0.0, 37.5, 37.5, 1, 5.175555005801868e-17),
+        (HalfBinomialLoss(), 0.0, 232.8, 232.8, 1, 1.4287342391028437e-101),
+        (HalfBinomialLoss(), 1.0, 1e20, 0, 0, 0),
+        (HalfBinomialLoss(), 0.0, 1e20, 1e20, 1, 0),
+        (
+            HalfBinomialLoss(),
+            1.0,
+            232.8,
+            0,
+            -1.4287342391028437e-101,
+            1.4287342391028437e-101,
+        ),
+        (HalfBinomialLoss(), 1.0, 232.9, 0, 0, 0),
+        (HalfBinomialLoss(), 1.0, 1e3, 0, 0, 0),
+        (HalfBinomialLoss(), 0.0, 1e3, 1e3, 1, 0),
         (
             HalfMultinomialLoss(n_classes=3),
             0.0,
             [0.2, 0.5, 0.3],
             logsumexp([0.2, 0.5, 0.3]) - 0.2,
+            None,
+            None,
         ),
         (
             HalfMultinomialLoss(n_classes=3),
             1.0,
             [0.2, 0.5, 0.3],
             logsumexp([0.2, 0.5, 0.3]) - 0.5,
+            None,
+            None,
         ),
         (
             HalfMultinomialLoss(n_classes=3),
             2.0,
             [0.2, 0.5, 0.3],
             logsumexp([0.2, 0.5, 0.3]) - 0.3,
+            None,
+            None,
         ),
+        (
+            HalfMultinomialLoss(n_classes=3),
+            2.0,
+            [1e4, 0, 7e-7],
+            logsumexp([1e4, 0, 7e-7]) - (7e-7),
+            None,
+            None,
+        ),
     ],
     ids=loss_instance_name,
 )
-def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true):
-    """Test losses at specific values."""
-    assert loss(
+def test_loss_on_specific_values(
+    loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true
+):
+    """Test losses, gradients and hessians at specific values."""
+    loss1 = loss(y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]))
+    grad1 = loss.gradient(
+        y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
+    )
+    loss2, grad2 = loss.loss_gradient(
+        y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
+    )
+    grad3, hess = loss.gradient_hessian(
         y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
-    ) == approx(loss_true, rel=1e-11, abs=1e-12)
+    )

+    assert loss1 == approx(loss_true, rel=1e-15, abs=1e-15)
+    assert loss2 == approx(loss_true, rel=1e-15, abs=1e-15)

+    if gradient_true is not None:
+        assert grad1 == approx(gradient_true, rel=1e-15, abs=1e-15)
+        assert grad2 == approx(gradient_true, rel=1e-15, abs=1e-15)
+        assert grad3 == approx(gradient_true, rel=1e-15, abs=1e-15)

+    if hessian_true is not None:
+        assert hess == approx(hessian_true, rel=1e-15, abs=1e-15)


 @pytest.mark.parametrize("loss", ALL_LOSSES)
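
Finally, a short usage sketch of the private loss API exercised by the new test; the expected numbers are taken from the parametrization above, and a scikit-learn build containing this fix is assumed. HalfBinomialLoss lives in a private module, so this is for illustration and debugging only.

import numpy as np
from sklearn._loss.loss import HalfBinomialLoss

loss = HalfBinomialLoss()
y_true = np.array([1.0])
raw_prediction = np.array([-1e3])  # very confident raw score on the wrong side

gradient, hessian = loss.gradient_hessian(
    y_true=y_true, raw_prediction=raw_prediction
)
print(gradient, hessian)  # [-1.] [0.], where the old gradient could give nan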