ENH migrate GLMs / TweedieRegressor to linear loss #22548

Merged · 48 commits · Mar 28, 2022

Commits

decbb5a  DOC fix missing power of 2 (lorentzenchr, Feb 18, 2022)
26c0836  FIX typo in error message (lorentzenchr, Feb 18, 2022)
fb66241  FIX bug in loss tests (lorentzenchr, Feb 19, 2022)
395743a  FEA add HalfTweedieLossIdentity (lorentzenchr, Feb 19, 2022)
c029dad  DOC comment about linear predictor (lorentzenchr, Feb 19, 2022)
2eb28f4  ENH refactor glm.py with LinearModelLoss (lorentzenchr, Feb 19, 2022)
fd8d1ca  MAINT remove link.py in _glm submodule (lorentzenchr, Feb 19, 2022)
09b243a  CLN googlemail to gmail (lorentzenchr, Feb 19, 2022)
cecb328  MAINT copy tweedie deviance to regression metrics (lorentzenchr, Feb 19, 2022)
719bb1b  MAINT remove glm_distribution.py (lorentzenchr, Feb 19, 2022)
5f56968  CLN fix comment (lorentzenchr, Feb 19, 2022)
7adc1bc  CLN fix bug in tweedie deviance metrics (lorentzenchr, Feb 19, 2022)
2c13683  FIX dtype because of lbfgs (lorentzenchr, Feb 19, 2022)
02dd2de  TST TweedieRegressor for validate in init (lorentzenchr, Feb 19, 2022)
5a455b0  ENH use _openmp_effective_n_threads (lorentzenchr, Feb 19, 2022)
dda943f  DOC add whatsnew (lorentzenchr, Feb 19, 2022)
9559464  FIX dtype in score funtion (lorentzenchr, Feb 19, 2022)
85f616f  FIX validation of base_loss_class (lorentzenchr, Feb 19, 2022)
bc19cf3  TST validation of y_true range (lorentzenchr, Feb 19, 2022)
9582e8b  DOC fix typo (lorentzenchr, Feb 19, 2022)
3b6beda  MNT make GeneralizedLinearRegressor private (lorentzenchr, Feb 25, 2022)
0ebe6f2  DOC typo and text improvements (lorentzenchr, Feb 25, 2022)
e314ad9  MNT make base_loss_class private (lorentzenchr, Feb 25, 2022)
6b25501  CLN address review comments (lorentzenchr, Feb 25, 2022)
8c6e08d  Merge branch 'main' into migrate_glm_to_linear_loss (lorentzenchr, Mar 8, 2022)
324b858  More valid cases for HalfTweedieLossIdentity(power=0) (ogrisel, Mar 9, 2022)
d5f4723  Add test_tweedie_log_identity_consistency (ogrisel, Mar 9, 2022)
80bf470  Did not mean to use a logspace for raw_prediction (ogrisel, Mar 9, 2022)
5c9b264  CLN address review comments (lorentzenchr, Mar 10, 2022)
db6c192  DEP family (lorentzenchr, Mar 10, 2022)
a0be233  CLN remove base_loss_params (lorentzenchr, Mar 10, 2022)
225863b  TST fix test_tweedie_log_identity_consistency (lorentzenchr, Mar 10, 2022)
38b3104  DOC add deprecation and change info in whatsnew (lorentzenchr, Mar 10, 2022)
4bfc1e9  Revert "MAINT remove glm_distribution.py" (lorentzenchr, Mar 18, 2022)
7f428b5  MNT add removal TODO for glm_distribution.py (lorentzenchr, Mar 18, 2022)
a1474d0  DEP backward compatible deprecation of family property (lorentzenchr, Mar 18, 2022)
2ca9586  Merge branch 'main' into migrate_glm_to_linear_loss (lorentzenchr, Mar 18, 2022)
f23dd31  CLN Idea removing _base_loss (thomasjpfan, Mar 19, 2022)
129bce5  Merge pull request #3 from thomasjpfan/pr/22548_idea (lorentzenchr, Mar 23, 2022)
db4d402  CLN fix type in test name (lorentzenchr, Mar 24, 2022)
87dd217  Small refactor for _mean_tweedie_deviance (thomasjpfan, Mar 27, 2022)
238bf6e  Merge pull request #4 from thomasjpfan/pr/22548_memory (lorentzenchr, Mar 28, 2022)
5170aca  CLN fix merge conflict in whatsnew (lorentzenchr, Mar 28, 2022)
111dd8a  Merge branch 'migrate_glm_to_linear_loss' of https://github.com/loren… (lorentzenchr, Mar 28, 2022)
5a5a22f  CLN remove HalfInverseGaussianLoss (lorentzenchr, Mar 28, 2022)
16e8be5  DOC move private _base_loss to attributes section (lorentzenchr, Mar 28, 2022)
0024ca3  CLN remove comment (lorentzenchr, Mar 28, 2022)
49fb93f  CLN TODO(1.3) (lorentzenchr, Mar 28, 2022)

Files changed

2 changes: 1 addition & 1 deletion doc/modules/linear_model.rst
@@ -1032,7 +1032,7 @@ reproductive exponential dispersion model (EDM) [11]_).

The minimization problem becomes:

.. math:: \min_{w} \frac{1}{2 n_{\text{samples}}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2,
.. math:: \min_{w} \frac{1}{2 n_{\text{samples}}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2^2,

where :math:`\alpha` is the L2 regularization penalty. When sample weights are
provided, the average becomes a weighted average.
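
The objective corrected in this hunk is the L2-penalized mean half deviance. As a minimal sketch of that formula (plain NumPy, hypothetical helper name, Poisson deviance with log link picked only as an example; this is not scikit-learn's internal code):

import numpy as np

def penalized_half_deviance(w, X, y, alpha):
    """1 / (2 * n_samples) * sum_i d(y_i, y_hat_i) + alpha / 2 * ||w||_2^2."""
    y_hat = np.exp(X @ w)  # Poisson GLM with log link: y_hat = exp(X @ w)
    # Poisson unit deviance d(y, y_hat) = 2 * (y * log(y / y_hat) - y + y_hat),
    # with the convention y * log(y / y_hat) = 0 when y == 0.
    term = np.where(y > 0, y * np.log(np.where(y > 0, y, 1.0) / y_hat), 0.0)
    dev = 2.0 * (term - y + y_hat)
    return dev.mean() / 2.0 + 0.5 * alpha * np.dot(w, w)
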
12 changes: 12 additions & 0 deletions doc/whats_new/v1.1.rst
@@ -557,6 +557,12 @@ Changelog
:pr:`21808`, :pr:`20567` and :pr:`21814` by
:user:`Christian Lorentzen <lorentzenchr>`.

- |Enhancement| :class:`~linear_model.GammaRegressor`,
:class:`~linear_model.PoissonRegressor` and :class:`~linear_model.TweedieRegressor`
are faster for ``solvers="lbfgs"``.
:pr:`22548`, :pr:`21808` and :pr:`20567` by
:user:`Christian Lorentzen <lorentzenchr>`.

- |Enhancement| Rename parameter `base_estimator` to `estimator` in
:class:`linear_model.RANSACRegressor` to improve readability and consistency.
`base_estimator` is deprecated and will be removed in 1.3.
@@ -590,6 +596,12 @@ Changelog
sub-problem while now all of them are recorded. :pr:`21998` by
:user:`Olivier Grisel <ogrisel>`.

- |Fix| The property `family` of :class:`linear_model.TweedieRegressor` is not
validated in `__init__` anymore. Instead, this (private) property is deprecated in
:class:`linear_model.GammaRegressor`, :class:`linear_model.PoissonRegressor` and
:class:`linear_model.TweedieRegressor`, and will be removed in 1.3.
:pr:`22548` by :user:`Christian Lorentzen <lorentzenchr>`.

- |Enhancement| :class:`linear_model.BayesianRidge` and
:class:`linear_model.ARDRegression` now preserve float32 dtype. :pr:`9087` by
:user:`Arthur Imbert <Henley13>` and :pr:`22525` by :user:`Meekail Zain <micky774>`.
2 changes: 2 additions & 0 deletions sklearn/_loss/__init__.py
@@ -10,6 +10,7 @@
HalfPoissonLoss,
HalfGammaLoss,
HalfTweedieLoss,
HalfTweedieLossIdentity,
HalfBinomialLoss,
HalfMultinomialLoss,
)
@@ -22,6 +23,7 @@
"HalfPoissonLoss",
"HalfGammaLoss",
"HalfTweedieLoss",
"HalfTweedieLossIdentity",
"HalfBinomialLoss",
"HalfMultinomialLoss",
]
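
With this export in place, the new loss should be importable from the private sklearn._loss package. A tiny, hedged sanity check (private API, shown only for illustration; the constructor signature is taken from the loss.py diff further below):

from sklearn._loss import HalfTweedieLossIdentity

loss = HalfTweedieLossIdentity(power=1.5)  # identity link, Tweedie power 1.5
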
7 changes: 7 additions & 0 deletions sklearn/_loss/_loss.pxd
@@ -69,6 +69,13 @@ cdef class CyHalfTweedieLoss(CyLossFunction):
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil


cdef class CyHalfTweedieLossIdentity(CyLossFunction):
cdef readonly double power # readonly makes it accessible from Python
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil


cdef class CyHalfBinomialLoss(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
127 changes: 124 additions & 3 deletions sklearn/_loss/_loss.pyx.tp
@@ -1,7 +1,7 @@
{{py:

"""
Template file for easily generate loops over samples using Tempita
Template file to easily generate loops over samples using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).

Generated file: _loss.pyx
@@ -117,6 +117,28 @@ doc_HalfTweedieLoss = (
"""
)

doc_HalfTweedieLossIdentity = (
"""Half Tweedie deviance loss with identity link.

Domain:
y_true in real numbers if p <= 0
y_true in non-negative real numbers if 0 < p < 2
y_true in positive real numbers if p >= 2
y_pred and power in positive real numbers, y_pred may be negative for p=0.

Link:
y_pred = raw_prediction

Half Tweedie deviance with identity link and p=power is
max(y_true, 0)**(2-p) / (1-p) / (2-p)
- y_true * y_pred**(1-p) / (1-p)
+ y_pred**(2-p) / (2-p)

Notes:
- Here, we do not drop constant terms in contrast to the version with log-link.
"""
)

doc_HalfBinomialLoss = (
"""Half Binomial deviance loss with logit link.

@@ -151,6 +173,9 @@ class_list = [
("CyHalfTweedieLoss", doc_HalfTweedieLoss, "power",
"closs_half_tweedie", "closs_grad_half_tweedie",
"cgradient_half_tweedie", "cgrad_hess_half_tweedie"),
("CyHalfTweedieLossIdentity", doc_HalfTweedieLossIdentity, "power",
"closs_half_tweedie_identity", "closs_grad_half_tweedie_identity",
"cgradient_half_tweedie_identity", "cgrad_hess_half_tweedie_identity"),
("CyHalfBinomialLoss", doc_HalfBinomialLoss, None,
"closs_half_binomial", "closs_grad_half_binomial",
"cgradient_half_binomial", "cgrad_hess_half_binomial"),
@@ -194,7 +219,7 @@ from cython.parallel import parallel, prange
import numpy as np
cimport numpy as np

from libc.math cimport exp, fabs, log, log1p
from libc.math cimport exp, fabs, log, log1p, pow
from libc.stdlib cimport malloc, free

np.import_array()
@@ -420,7 +445,7 @@ cdef inline double_pair cgrad_hess_half_gamma(


# Half Tweedie Deviance with Log-Link, dropping constant terms
# Note that by dropping constants this is no longer smooth in parameter power.
# Note that by dropping constants this is no longer continuous in parameter power.
cdef inline double closs_half_tweedie(
double y_true,
double raw_prediction,
@@ -501,6 +526,102 @@ cdef inline double_pair cgrad_hess_half_tweedie(
return gh


# Half Tweedie Deviance with identity link, without dropping constant terms!
# Therefore, best loss value is zero.
cdef inline double closs_half_tweedie_identity(
double y_true,
double raw_prediction,
double power
) nogil:
cdef double tmp
if power == 0.:
return closs_half_squared_error(y_true, raw_prediction)
elif power == 1.:
if y_true == 0:
return raw_prediction
else:
return y_true * log(y_true/raw_prediction) + raw_prediction - y_true
elif power == 2.:
return log(raw_prediction/y_true) + y_true/raw_prediction - 1.
else:
tmp = pow(raw_prediction, 1. - power)
tmp = raw_prediction * tmp / (2. - power) - y_true * tmp / (1. - power)
if y_true > 0:
tmp += pow(y_true, 2. - power) / ((1. - power) * (2. - power))
return tmp


cdef inline double cgradient_half_tweedie_identity(
double y_true,
double raw_prediction,
double power
) nogil:
if power == 0.:
return raw_prediction - y_true
elif power == 1.:
return 1. - y_true / raw_prediction
elif power == 2.:
return (raw_prediction - y_true) / (raw_prediction * raw_prediction)
else:
return pow(raw_prediction, -power) * (raw_prediction - y_true)


cdef inline double_pair closs_grad_half_tweedie_identity(
double y_true,
double raw_prediction,
double power
) nogil:
cdef double_pair lg
cdef double tmp
if power == 0.:
lg.val2 = raw_prediction - y_true # gradient
lg.val1 = 0.5 * lg.val2 * lg.val2 # loss
elif power == 1.:
if y_true == 0:
lg.val1 = raw_prediction
else:
lg.val1 = (y_true * log(y_true/raw_prediction) # loss
+ raw_prediction - y_true)
lg.val2 = 1. - y_true / raw_prediction # gradient
elif power == 2.:
lg.val1 = log(raw_prediction/y_true) + y_true/raw_prediction - 1. # loss
tmp = raw_prediction * raw_prediction
lg.val2 = (raw_prediction - y_true) / tmp # gradient
else:
tmp = pow(raw_prediction, 1. - power)
lg.val1 = (raw_prediction * tmp / (2. - power) # loss
- y_true * tmp / (1. - power))
if y_true > 0:
lg.val1 += (pow(y_true, 2. - power)
/ ((1. - power) * (2. - power)))
lg.val2 = tmp * (1. - y_true / raw_prediction) # gradient
return lg


cdef inline double_pair cgrad_hess_half_tweedie_identity(
double y_true,
double raw_prediction,
double power
) nogil:
cdef double_pair gh
cdef double tmp
if power == 0.:
gh.val1 = raw_prediction - y_true # gradient
gh.val2 = 1. # hessian
elif power == 1.:
gh.val1 = 1. - y_true / raw_prediction # gradient
gh.val2 = y_true / (raw_prediction * raw_prediction) # hessian
elif power == 2.:
tmp = raw_prediction * raw_prediction
gh.val1 = (raw_prediction - y_true) / tmp # gradient
gh.val2 = (-1. + 2. * y_true / raw_prediction) / tmp # hessian
else:
tmp = pow(raw_prediction, -power)
gh.val1 = tmp * (raw_prediction - y_true) # gradient
gh.val2 = tmp * ((1. - power) + power * y_true / raw_prediction) # hessian
return gh


# Half Binomial deviance with logit-link, aka log-loss or binary cross entropy
cdef inline double closs_half_binomial(
double y_true,
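
To make the closed forms above concrete, here is a hedged pure-NumPy sketch of closs_half_tweedie_identity and cgradient_half_tweedie_identity (not the Cython code itself; it assumes positive raw predictions for power != 0, as the domain in the docstring requires), with a finite-difference check of the gradient for a generic power:

import numpy as np

def half_tweedie_identity_loss(y, raw, p):
    if p == 0.0:
        return 0.5 * (raw - y) ** 2
    if p == 1.0:  # Poisson limit; convention y * log(y / raw) = 0 for y == 0
        term = np.where(y > 0, y * np.log(np.where(y > 0, y, 1.0) / raw), 0.0)
        return term + raw - y
    if p == 2.0:  # Gamma limit
        return np.log(raw / y) + y / raw - 1.0
    loss = raw ** (2 - p) / (2 - p) - y * raw ** (1 - p) / (1 - p)
    return loss + np.where(y > 0, np.maximum(y, 0.0) ** (2 - p), 0.0) / ((1 - p) * (2 - p))

def half_tweedie_identity_gradient(y, raw, p):
    # d loss / d raw_prediction = raw**(-p) * (raw - y); the p = 0, 1, 2 cases
    # above are just this expression evaluated at those powers.
    return raw ** (-p) * (raw - y)

rng = np.random.default_rng(0)
y, raw, p, eps = rng.gamma(2.0, size=5), rng.gamma(2.0, size=5), 1.5, 1e-6
numeric = (half_tweedie_identity_loss(y, raw + eps, p)
           - half_tweedie_identity_loss(y, raw - eps, p)) / (2 * eps)
assert np.allclose(numeric, half_tweedie_identity_gradient(y, raw, p), rtol=1e-4)
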
4 changes: 4 additions & 0 deletions sklearn/_loss/glm_distribution.py
@@ -4,6 +4,10 @@

# Author: Christian Lorentzen <lorentzen.ch@googlemail.com>
# License: BSD 3 clause
#
# TODO(1.3): remove file
# This is only used for backward compatibility in _GeneralizedLinearRegressor
# for the deprecated family attribute.

from abc import ABCMeta, abstractmethod
from collections import namedtuple
4 changes: 2 additions & 2 deletions sklearn/_loss/link.py
@@ -1,7 +1,7 @@
"""
Module contains classes for invertible (and differentiable) link functions.
"""
# Author: Christian Lorentzen <lorentzen.ch@googlemail.com>
# Author: Christian Lorentzen <lorentzen.ch@gmail.com>

from abc import ABC, abstractmethod
from dataclasses import dataclass
@@ -23,7 +23,7 @@ def __post_init__(self):
"""Check that low <= high"""
if self.low > self.high:
raise ValueError(
f"On must have low <= high; got low={self.low}, high={self.high}."
f"One must have low <= high; got low={self.low}, high={self.high}."
)

def includes(self, x):
47 changes: 47 additions & 0 deletions sklearn/_loss/loss.py
@@ -25,6 +25,7 @@
CyHalfPoissonLoss,
CyHalfGammaLoss,
CyHalfTweedieLoss,
CyHalfTweedieLossIdentity,
CyHalfBinomialLoss,
CyHalfMultinomialLoss,
)
@@ -770,6 +771,52 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None):
return term


class HalfTweedieLossIdentity(BaseLoss):

[Review thread on class HalfTweedieLossIdentity(BaseLoss):]

Member: My curiosity: when does it make sense to use identity link with power != 0?

Member Author (lorentzenchr): Years ago, I thought it a good idea. Meanwhile, I don't think it is useful. Therefore, I opened #19086 without much response.

Another commenter, quoting the question above: I have a problem where I know the expectation y' follows the linear model y' = w x. My measurements, y, have Poisson errors. (The specific problem involves analysis of radiation measurements: the expectation is linear in the amount of source; the measurements are Poisson distributed.) Using a log link function is just not the right description of my problem. Yes, the whole thing breaks down when evaluating negative values of w, but it seems much better to offer a constraint that avoids ever evaluating negative values of w than to exclude situations where there is an actual linear relationship with Poisson measurements.

"""Half Tweedie deviance loss with identity link, for regression.
[Review comment on lines +774 to +775, Member Author (lorentzenchr): This new loss class is needed for TweedieRegressor(link="identity"). See the Cython implementation in file _loss.pyx.tp.]


Domain:
y_true in real numbers for power <= 0
y_true in non-negative real numbers for 0 < power < 2
y_true in positive real numbers for 2 <= power
y_pred in positive real numbers for power != 0
y_pred in real numbers for power = 0
power in real numbers

Link:
y_pred = raw_prediction

For a given sample x_i, half Tweedie deviance loss with p=power is defined
as::

loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
- y_true_i * raw_prediction_i**(1-p) / (1-p)
+ raw_prediction_i**(2-p) / (2-p)

Note that the minimum value of this loss is 0.

Note furthermore that although no Tweedie distribution exists for
0 < power < 1, it still gives a strictly consistent scoring function for
the expectation.
"""

def __init__(self, sample_weight=None, power=1.5):
super().__init__(
closs=CyHalfTweedieLossIdentity(power=float(power)),
link=IdentityLink(),
)
if self.closs.power <= 0:
self.interval_y_true = Interval(-np.inf, np.inf, False, False)
elif self.closs.power < 2:
self.interval_y_true = Interval(0, np.inf, True, False)
else:
self.interval_y_true = Interval(0, np.inf, False, False)

if self.closs.power == 0:
self.interval_y_pred = Interval(-np.inf, np.inf, False, False)
else:
self.interval_y_pred = Interval(0, np.inf, False, False)


class HalfBinomialLoss(BaseLoss):
"""Half Binomial deviance loss with logit link, for binary classification.

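
A hedged illustration (pure NumPy, independent of the private BaseLoss API) of why this identity-link class keeps the constant terms: it differs from the log-link HalfTweedieLoss, which drops them, only by a term depending on y_true, so both losses share the same minimizer in y_pred. This is the idea behind the test_tweedie_log_identity_consistency commit above.

import numpy as np

p = 1.5
rng = np.random.default_rng(0)
y = rng.gamma(2.0, size=10)
y_pred = rng.gamma(2.0, size=10)

# Full half Tweedie deviance (the identity-link class keeps all terms).
loss_full = (np.maximum(y, 0.0) ** (2 - p) / ((1 - p) * (2 - p))
             - y * y_pred ** (1 - p) / (1 - p)
             + y_pred ** (2 - p) / (2 - p))
# The log-link HalfTweedieLoss drops the term that does not depend on y_pred.
loss_dropped = (y_pred ** (2 - p) / (2 - p)
                - y * y_pred ** (1 - p) / (1 - p))

# The difference depends only on y_true, not on y_pred.
assert np.allclose(loss_full - loss_dropped,
                   y ** (2 - p) / ((1 - p) * (2 - p)))
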
2 changes: 2 additions & 0 deletions sklearn/_loss/tests/test_glm_distribution.py
@@ -1,6 +1,8 @@
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause
#
# TODO(1.3): remove file
import numpy as np
from numpy.testing import (
assert_allclose,
2 changes: 1 addition & 1 deletion sklearn/_loss/tests/test_link.py
@@ -16,7 +16,7 @@
def test_interval_raises():
"""Test that interval with low > high raises ValueError."""
with pytest.raises(
ValueError, match="On must have low <= high; got low=1, high=0."
ValueError, match="One must have low <= high; got low=1, high=0."
):
Interval(1, 0, False, False)
