From decbb5a91e16875411802c59b42ae82d9ddf7f58 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 18 Feb 2022 11:47:12 +0100 Subject: [PATCH 01/43] DOC fix missing power of 2 --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 6d176e8482537..24dfa901b1d42 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1032,7 +1032,7 @@ reproductive exponential dispersion model (EDM) [11]_). The minimization problem becomes: -.. math:: \min_{w} \frac{1}{2 n_{\text{samples}}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2, +.. math:: \min_{w} \frac{1}{2 n_{\text{samples}}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2^2, where :math:`\alpha` is the L2 regularization penalty. When sample weights are provided, the average becomes a weighted average. From 26c08366c1211ffe18cd6350f976d8b5f9eb9326 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 18 Feb 2022 14:47:29 +0100 Subject: [PATCH 02/43] FIX typo in error message --- sklearn/_loss/link.py | 2 +- sklearn/_loss/tests/test_link.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index 18ad5901d1f3c..5104ae2ca6ec5 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -23,7 +23,7 @@ def __post_init__(self): """Check that low <= high""" if self.low > self.high: raise ValueError( - f"On must have low <= high; got low={self.low}, high={self.high}." + f"One must have low <= high; got low={self.low}, high={self.high}." ) def includes(self, x): diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index b363a45109989..435361eaa50f1 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -16,7 +16,7 @@ def test_interval_raises(): """Test that interval with low > high raises ValueError.""" with pytest.raises( - ValueError, match="On must have low <= high; got low=1, high=0." + ValueError, match="One must have low <= high; got low=1, high=0." 
): Interval(1, 0, False, False) From fb662412a4e84cfaddaf47cac5128702d1194614 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 10:16:51 +0100 Subject: [PATCH 03/43] FIX bug in loss tests --- sklearn/_loss/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 72f7bae097a33..fc4c8fbcaed04 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -71,7 +71,7 @@ def random_y_true_raw_prediction( y_true = np.arange(n_samples).astype(float) % loss.n_classes else: raw_prediction = rng.uniform( - low=raw_bound[0], high=raw_bound[0], size=n_samples + low=raw_bound[0], high=raw_bound[1], size=n_samples ) # generate a y_true in valid range low, high = _inclusive_low_high(loss.interval_y_true) From 395743a0fad29fac167e23e1e713a550a402e7b8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 10:26:44 +0100 Subject: [PATCH 04/43] FEA add HalfTweedieLossIdentity --- sklearn/_loss/__init__.py | 2 + sklearn/_loss/_loss.pxd | 7 ++ sklearn/_loss/_loss.pyx.tp | 130 ++++++++++++++++++++++++++++++- sklearn/_loss/loss.py | 46 +++++++++++ sklearn/_loss/tests/test_loss.py | 35 +++++++++ 5 files changed, 218 insertions(+), 2 deletions(-) diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py index 14548c62231a2..63ae3038df8ae 100644 --- a/sklearn/_loss/__init__.py +++ b/sklearn/_loss/__init__.py @@ -10,6 +10,7 @@ HalfPoissonLoss, HalfGammaLoss, HalfTweedieLoss, + HalfTweedieLossIdentity, HalfBinomialLoss, HalfMultinomialLoss, ) @@ -22,6 +23,7 @@ "HalfPoissonLoss", "HalfGammaLoss", "HalfTweedieLoss", + "HalfTweedieLossIdentity", "HalfBinomialLoss", "HalfMultinomialLoss", ] diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd index 7255243d331dc..36fc91586a1df 100644 --- a/sklearn/_loss/_loss.pxd +++ b/sklearn/_loss/_loss.pxd @@ -69,6 +69,13 @@ cdef class CyHalfTweedieLoss(CyLossFunction): cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyHalfTweedieLossIdentity(CyLossFunction): + cdef readonly double power # readonly makes it accessible from Python + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil + + cdef class CyHalfBinomialLoss(CyLossFunction): cdef double cy_loss(self, double y_true, double raw_prediction) nogil cdef double cy_gradient(self, double y_true, double raw_prediction) nogil diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 5565cc108af07..208a6f257d421 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -117,6 +117,31 @@ doc_HalfTweedieLoss = ( """ ) +doc_HalfTweedieLossIdentity = ( + """Half Tweedie deviance loss with identity link. + + Domain: + y_true in real numbers if p <= 0 + y_true in non-negative real numbers if 0 < p < 2 + y_true in positive real numbers if p >= 2 + y_pred and power in positive real numbers, y_pred may be negative for p=0. 
+ + Link: + y_pred = raw_prediction + + Half Tweedie deviance with identity link and p=power is + max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * y_pred**(1-p) / (1-p) + + y_pred**(2-p) / (2-p) + = max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + exp((2-p) * raw_prediction) / (2-p) + + Notes: + - Here, we do not drop constant terms in contrast to the version with log-link. + """ +) + doc_HalfBinomialLoss = ( """Half Binomial deviance loss with logit link. @@ -151,6 +176,9 @@ class_list = [ ("CyHalfTweedieLoss", doc_HalfTweedieLoss, "power", "closs_half_tweedie", "closs_grad_half_tweedie", "cgradient_half_tweedie", "cgrad_hess_half_tweedie"), + ("CyHalfTweedieLossIdentity", doc_HalfTweedieLossIdentity, "power", + "closs_half_tweedie_identity", "closs_grad_half_tweedie_identity", + "cgradient_half_tweedie_identity", "cgrad_hess_half_tweedie_identity"), ("CyHalfBinomialLoss", doc_HalfBinomialLoss, None, "closs_half_binomial", "closs_grad_half_binomial", "cgradient_half_binomial", "cgrad_hess_half_binomial"), @@ -194,7 +222,7 @@ from cython.parallel import parallel, prange import numpy as np cimport numpy as np -from libc.math cimport exp, fabs, log, log1p +from libc.math cimport exp, fabs, log, log1p, pow from libc.stdlib cimport malloc, free np.import_array() @@ -420,7 +448,7 @@ cdef inline double_pair cgrad_hess_half_gamma( # Half Tweedie Deviance with Log-Link, dropping constant terms -# Note that by dropping constants this is no longer smooth in parameter power. +# Note that by dropping constants this is no longer continuous in parameter power. cdef inline double closs_half_tweedie( double y_true, double raw_prediction, @@ -501,6 +529,104 @@ cdef inline double_pair cgrad_hess_half_tweedie( return gh +# Half Tweedie Deviance with identity link, without dropping constant terms! +# Therefore, best loss value is zero. +cdef inline double closs_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) nogil: + cdef double tmp + if power == 0.: + return closs_half_squared_error(y_true, raw_prediction) + elif power == 1.: + if y_true == 0: + return raw_prediction + else: + return y_true * log(y_true/raw_prediction) + raw_prediction - y_true + elif power == 2.: + return log(raw_prediction/y_true) + y_true/raw_prediction - 1. + else: + tmp = pow(raw_prediction, 1. - power) + tmp = raw_prediction * tmp / (2. - power) - y_true * tmp / (1. - power) + if y_true > 0: + tmp += pow(y_true, 2. - power) / ((1. - power) * (2. - power)) + return tmp + + +cdef inline double cgradient_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) nogil: + cdef double tmp + if power == 0.: + return raw_prediction - y_true + elif power == 1.: + return 1. - y_true / raw_prediction + elif power == 2.: + return (raw_prediction - y_true) / (raw_prediction * raw_prediction) + else: + tmp = pow(raw_prediction, -power) + return tmp * (raw_prediction - y_true) + + +cdef inline double_pair closs_grad_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) nogil: + cdef double_pair lg + cdef double tmp + if power == 0.: + lg.val2 = raw_prediction - y_true # gradient + lg.val1 = 0.5 * lg.val2 * lg.val2 # loss + elif power == 1.: + if y_true == 0: + lg.val1 = raw_prediction + else: + lg.val1 = (y_true * log(y_true/raw_prediction) # loss + + raw_prediction - y_true) + lg.val2 = 1. - y_true / raw_prediction # gradient + elif power == 2.: + lg.val1 = log(raw_prediction/y_true) + y_true/raw_prediction - 1. 
# loss + tmp = raw_prediction * raw_prediction + lg.val2 = (raw_prediction - y_true) / tmp # gradient + else: + tmp = pow(raw_prediction, 1. - power) + lg.val1 = (raw_prediction * tmp / (2. - power) # loss + - y_true * tmp / (1. - power)) + if y_true > 0: + lg.val1 += (pow(y_true, 2. - power) + / ((1. - power) * (2. - power))) + lg.val2 = tmp * (1. - y_true / raw_prediction) # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) nogil: + cdef double_pair gh + cdef double tmp + if power == 0.: + gh.val1 = raw_prediction - y_true # gradient + gh.val2 = 1. # hessian + elif power == 1.: + gh.val1 = 1. - y_true / raw_prediction # gradient + gh.val2 = y_true / (raw_prediction * raw_prediction) # hessian + elif power == 2.: + tmp = raw_prediction * raw_prediction + gh.val1 = (raw_prediction - y_true) / tmp # gradient + gh.val2 = (-1. + 2. * y_true / raw_prediction) / tmp # hessian + else: + tmp = pow(raw_prediction, -power) + gh.val1 = tmp * (raw_prediction - y_true) # gradient + gh.val2 = tmp * ((1. - power) + power * y_true / raw_prediction) # hessian + return gh + + # Half Binomial deviance with logit-link, aka log-loss or binary cross entropy cdef inline double closs_half_binomial( double y_true, diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 1a2353d18df3b..b3f358248ffe4 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -24,6 +24,7 @@ CyHalfPoissonLoss, CyHalfGammaLoss, CyHalfTweedieLoss, + CyHalfTweedieLossIdentity, CyHalfBinomialLoss, CyHalfMultinomialLoss, ) @@ -757,6 +758,51 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term +class HalfTweedieLossIdentity(BaseLoss): + """Half Tweedie deviance loss with identity link, for regression. + + Domain: + y_true in real numbers for power <= 0 + y_true in non-negative real numbers for 0 < power < 2 + y_true in positive real numbers for 2 <= power + y_pred in positive real numbers, all reals for power = 0 + power in real numbers + + Link: + y_pred = raw_prediction + + For a given sample x_i, half Tweedie deviance loss with p=power is defined + as:: + + loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p) + - y_true_i * raw_prediction_i**(1-p) / (1-p) + + raw_prediction_i**(2-p) / (2-p) + + Note that the minimum loss is 0. + + Note furthermore that although no Tweedie distribution exists for + 0 < power < 1, it still gives a strictly consistent scoring function for + the expectation. + """ + + def __init__(self, sample_weight=None, power=1.5): + super().__init__( + closs=CyHalfTweedieLossIdentity(power=float(power)), + link=IdentityLink(), + ) + if self.closs.power <= 0: + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + elif self.closs.power < 2: + self.interval_y_true = Interval(0, np.inf, True, False) + else: + self.interval_y_true = Interval(0, np.inf, False, False) + + if self.closs.power == 0: + self.interval_y_pred = Interval(-np.inf, np.inf, False, False) + else: + self.interval_y_pred = Interval(0, np.inf, False, False) + + class HalfBinomialLoss(BaseLoss): """Half Binomial deviance loss with logit link, for binary classification. 
diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index fc4c8fbcaed04..1c8940a2b5dc2 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -22,6 +22,7 @@ HalfPoissonLoss, HalfSquaredError, HalfTweedieLoss, + HalfTweedieLossIdentity, PinballLoss, ) from sklearn.utils import assert_all_finite @@ -40,6 +41,10 @@ HalfTweedieLoss(power=1), HalfTweedieLoss(power=2), HalfTweedieLoss(power=3.0), + HalfTweedieLossIdentity(power=0), + HalfTweedieLossIdentity(power=1), + HalfTweedieLossIdentity(power=2), + HalfTweedieLossIdentity(power=3.0), ] @@ -70,6 +75,12 @@ def random_y_true_raw_prediction( ) y_true = np.arange(n_samples).astype(float) % loss.n_classes else: + # If link is identity, we must respect the interval of y_pred: + if isinstance(loss.link, IdentityLink): + low, high = _inclusive_low_high(loss.interval_y_pred) + low = np.amax([low, raw_bound[0]]) + high = np.amin([high, raw_bound[1]]) + raw_bound = (low, high) raw_prediction = rng.uniform( low=raw_bound[0], high=raw_bound[1], size=n_samples ) @@ -149,6 +160,11 @@ def test_loss_boundary(loss): (HalfTweedieLoss(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLossIdentity(power=-3), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLossIdentity(power=0), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLossIdentity(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfTweedieLossIdentity(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLossIdentity(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), (HalfBinomialLoss(), [0.1, 0.5, 0.9], [-np.inf, -1, 2, np.inf]), (HalfMultinomialLoss(), [], [-np.inf, -1, 1.1, np.inf]), ] @@ -160,6 +176,9 @@ def test_loss_boundary(loss): (HalfTweedieLoss(power=-3), [-100, -0.1, 0], []), (HalfTweedieLoss(power=0), [-100, 0], []), (HalfTweedieLoss(power=1.5), [0], []), + (HalfTweedieLossIdentity(power=-3), [-100, -0.1, 0], []), + (HalfTweedieLossIdentity(power=0), [-100, 0], []), + (HalfTweedieLossIdentity(power=1.5), [0], []), (HalfBinomialLoss(), [0, 1], []), (HalfMultinomialLoss(), [0.0, 1.0, 2], []), ] @@ -169,6 +188,9 @@ def test_loss_boundary(loss): (HalfTweedieLoss(power=-3), [], [-3, -0.1, 0]), (HalfTweedieLoss(power=0), [], [-3, -0.1, 0]), (HalfTweedieLoss(power=1.5), [], [0]), + (HalfTweedieLossIdentity(power=-3), [], [-3, -0.1, 0]), + (HalfTweedieLossIdentity(power=0), [-3, -0.1, 0], []), + (HalfTweedieLossIdentity(power=1.5), [], [0]), (HalfBinomialLoss(), [], [0, 1]), (HalfMultinomialLoss(), [0.1, 0.5], [0, 1]), ] @@ -207,6 +229,9 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4)), (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4), (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4**2), + (HalfTweedieLossIdentity(power=1), 2.0, 4.0, 2 - 2 * np.log(2)), + (HalfTweedieLossIdentity(power=2), 2.0, 4.0, np.log(2) - 1 / 2), + (HalfTweedieLossIdentity(power=3), 2.0, 4.0, -1 / 4 + 1 / 4**2 + 1 / 2 / 2), (HalfBinomialLoss(), 0.25, np.log(4), np.log(5) - 0.25 * np.log(4)), ( HalfMultinomialLoss(n_classes=3), @@ -604,6 +629,16 @@ def test_loss_of_perfect_prediction(loss, sample_weight): if not loss.is_multiclass: # Use small values such that exp(value) is not nan. 
raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) + # If link is identity, we must respect the interval of y_pred: + if isinstance(loss.link, IdentityLink): + eps = 1e-10 + low = loss.interval_y_pred.low + if not loss.interval_y_pred.low_inclusive: + low = low + eps + high = loss.interval_y_pred.high + if not loss.interval_y_pred.high_inclusive: + high = high - eps + raw_prediction = np.clip(raw_prediction, low, high) y_true = loss.link.inverse(raw_prediction) else: # HalfMultinomialLoss From c029daddc4f88e42229dd6af84566e1a2e155e4b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 10:27:37 +0100 Subject: [PATCH 05/43] DOC comment about linear predictor --- sklearn/linear_model/_linear_loss.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index 93a7684aea5b6..88b42c952b509 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -9,6 +9,8 @@ class LinearModelLoss: """General class for loss functions with raw_prediction = X @ coef + intercept. + Note that raw_prediction is also known as linear predictor. + The loss is the sum of per sample losses and includes a term for L2 regularization:: From 2eb28f48f79f691293f346bcd04a97bc53327126 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 11:48:37 +0100 Subject: [PATCH 06/43] ENH refactor glm.py with LinearModelLoss --- sklearn/linear_model/_glm/glm.py | 403 ++++++++++---------- sklearn/linear_model/_glm/tests/test_glm.py | 244 ++++++------ 2 files changed, 335 insertions(+), 312 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index d7af8ae60d8b6..cc3e61a69d810 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -11,57 +11,53 @@ import numpy as np import scipy.optimize +from ..._loss.loss import ( + BaseLoss, + HalfGammaLoss, + HalfPoissonLoss, + HalfSquaredError, + HalfTweedieLoss, + HalfTweedieLossIdentity, +) from ...base import BaseEstimator, RegressorMixin from ...utils.optimize import _check_optimize_result -from ...utils import check_scalar +from ...utils import check_scalar, check_array from ...utils.validation import check_is_fitted, _check_sample_weight -from ..._loss.glm_distribution import ( - ExponentialDispersionModel, - TweedieDistribution, - EDM_DISTRIBUTIONS, -) -from .link import ( - BaseLink, - IdentityLink, - LogLink, -) +from .._linear_loss import LinearModelLoss -def _safe_lin_pred(X, coef): - """Compute the linear predictor taking care if intercept is present.""" - if coef.size == X.shape[1] + 1: - return X @ coef[1:] + coef[0] - else: - return X @ coef +class HalfInverseGaussianLoss(HalfTweedieLoss): + """Half inverse Gaussian deviance loss with log-link, for regression. + This is equivalent to HalfTweedieLoss(power=3). 
+ """ -def _y_pred_deviance_derivative(coef, X, y, weights, family, link): - """Compute y_pred and the derivative of the deviance w.r.t coef.""" - lin_pred = _safe_lin_pred(X, coef) - y_pred = link.inverse(lin_pred) - d1 = link.inverse_derivative(lin_pred) - temp = d1 * family.deviance_derivative(y, y_pred, weights) - if coef.size == X.shape[1] + 1: - devp = np.concatenate(([temp.sum()], temp @ X)) - else: - devp = temp @ X # same as X.T @ temp - return y_pred, devp + def __init__(self, sample_weight=None): + super().__init__(sample_weight=sample_weight, power=3) +# TODO: We could allow strings for base_loss_class (as before for the now removed +# family parameter): 'normal', 'poisson', 'gamma', 'inverse-gaussian' class GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): """Regression via a penalized Generalized Linear Model (GLM). - GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as y_pred=h(X*w). - Therefore, the fit minimizes the following objective function with L2 - priors as regularizer:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and + predicting the mean of the target y as y_pred=h(X*w) with coefficients w. + Therefore, the fit minimizes the following objective function with L2priors as + regularizer:: - 1/(2*sum(s)) * deviance(y, h(X*w); s) - + 1/2 * alpha * |w|_2 + 1/(2*sum(s_i)) * sum(s_i * deviance(y_i, h(x_i*w)) + 1/2 * alpha * ||w||_2^2 - with inverse link function h and s=sample_weight. + with inverse link function h, s=sample_weight and per observation (unit) deviance + deviance(y_i, h(x_i*w). Note that for an EDM 1/2 * deviance is the negative + log-likelihood up to a constant (in w) term. The parameter ``alpha`` corresponds to the lambda parameter in glmnet. + Instead of implementing the EDM family and a link function seperately, we directly + use the loss functions `from sklearn._loss` which have the link functions included + in them for performance reasons. We pick the loss functions that implement + (1/2 times) EDM deviances. + Read more in the :ref:`User Guide `. .. versionadded:: 0.23 @@ -79,19 +75,28 @@ class GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \ - or an ExponentialDispersionModel instance, default='normal' - The distributional assumption of the GLM, i.e. which distribution from - the EDM, specifies the loss function to be minimized. + base_loss_class : subclass of BaseLoss, default=HalfSquaredError + A `base_loss_class` contains a specific loss function as well as the link + function. The loss to be minimized specifies the distributional assumption of + the GLM, i.e. the distribution from the EDM. Here are some examples: + + ======================= ======== ========================== + base_loss_class Link Target Domain + ======================= ======== ========================== + HalfSquaredError identity y any real number + HalfPoissonLoss log 0 <= y + HalfGammaLoss log 0 < y + HalfInverseGaussianLoss log 0 < y + HalfTweedieLoss log dependend on tweedie power + ======================= ======== ========================== - link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \ - default='auto' The link function of the GLM, i.e. mapping from linear predictor - `X @ coeff + intercept` to prediction `y_pred`. 
Option 'auto' sets - the link depending on the chosen family as follows: + `X @ coeff + intercept` to prediction `y_pred`. For instance, with a log link, + we have `y_pred = exp(X @ coeff + intercept)`. - - 'identity' for Normal distribution - - 'log' for Poisson, Gamma and Inverse Gaussian distributions + base_loss_params : dictionary, default={} + Arguments to be passed to base_loss_class, e.g. {"power": 1.5} with + `base_loss_class=HalfTweedieLoss`. solver : 'lbfgs', default='lbfgs' Algorithm to use in the optimization problem: @@ -136,8 +141,8 @@ def __init__( *, alpha=1.0, fit_intercept=True, - family="normal", - link="auto", + base_loss_class=HalfSquaredError, + base_loss_params={}, solver="lbfgs", max_iter=100, tol=1e-4, @@ -146,8 +151,8 @@ def __init__( ): self.alpha = alpha self.fit_intercept = fit_intercept - self.family = family - self.link = link + self.base_loss_class = base_loss_class + self.base_loss_params = base_loss_params self.solver = solver self.max_iter = max_iter self.tol = tol @@ -173,47 +178,13 @@ def fit(self, X, y, sample_weight=None): self : object Fitted model. """ - if isinstance(self.family, ExponentialDispersionModel): - self._family_instance = self.family - elif self.family in EDM_DISTRIBUTIONS: - self._family_instance = EDM_DISTRIBUTIONS[self.family]() - else: + if not issubclass(self.base_loss_class, BaseLoss): raise ValueError( - "The family must be an instance of class" - " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse-gaussian']" - "; got (family={0})".format(self.family) + "The base_loss_class must be an subclass of" + " sklearn._loss.loss.BaseLoss; ; got" + f" (base_loss_class={self.base_loss})" ) - # Guarantee that self._link_instance is set to an instance of - # class BaseLink - if isinstance(self.link, BaseLink): - self._link_instance = self.link - else: - if self.link == "auto": - if isinstance(self._family_instance, TweedieDistribution): - if self._family_instance.power <= 0: - self._link_instance = IdentityLink() - if self._family_instance.power >= 1: - self._link_instance = LogLink() - else: - raise ValueError( - "No default link known for the " - "specified distribution family. Please " - "set link manually, i.e. not to 'auto'; " - "got (link='auto', family={})".format(self.family) - ) - elif self.link == "identity": - self._link_instance = IdentityLink() - elif self.link == "log": - self._link_instance = LogLink() - else: - raise ValueError( - "The link must be an instance of class Link or " - "an element of ['auto', 'identity', 'log']; " - "got (link={0})".format(self.link) - ) - check_scalar( self.alpha, name="alpha", @@ -257,9 +228,6 @@ def fit(self, X, y, sample_weight=None): "The argument warm_start must be bool; got {0}".format(self.warm_start) ) - family = self._family_instance - link = self._link_instance - X, y = self._validate_data( X, y, @@ -268,58 +236,66 @@ def fit(self, X, y, sample_weight=None): y_numeric=True, multi_output=False, ) + # required by losses + y = check_array(y, dtype=[np.float64, np.float32], order="C", ensure_2d=False) + + # TODO: We could support samples_weight=None as the losses support it. + # Note that _check_sample_weight calls check_array(order="C") required by + # losses. 
+ sample_weight = _check_sample_weight( + sample_weight, X, dtype=[np.float64, np.float32] + ) - weights = _check_sample_weight(sample_weight, X) + n_samples, n_features = X.shape - _, n_features = X.shape + self._linear_loss = LinearModelLoss( + base_loss=self._get_base_loss_instance(), + fit_intercept=self.fit_intercept, + ) - if not np.all(family.in_y_range(y)): + if not self._linear_loss.base_loss.in_y_true_range(y): raise ValueError( - "Some value(s) of y are out of the valid range for family {0}".format( - family.__class__.__name__ - ) + "Some value(s) of y are out of the valid range of the loss" + f" {self.base_loss_class.__name__}." ) + # TODO: if alpha=0 check that X is not rank deficient - # rescaling of sample_weight - # - # IMPORTANT NOTE: Since we want to minimize - # 1/(2*sum(sample_weight)) * deviance + L2, - # deviance = sum(sample_weight * unit_deviance), - # we rescale weights such that sum(weights) = 1 and this becomes - # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) - weights = weights / weights.sum() + # IMPORTANT NOTE: Rescaling of sample_weight: + # We want to minimize + # obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance) + # + 1/2 * alpha * L2, + # with + # deviance = 2 * loss. + # The objective is invariant to multiplying sample_weight by a constant. We + # choose this constant such that sum(sample_weight) = 1. Thus, we end up with + # obj = sum(sample_weight * loss) + 1/2 * alpha * L2. + # Note that LinearModelLoss.loss() computes sum(sample_weight * loss). + sample_weight = sample_weight / sample_weight.sum() if self.warm_start and hasattr(self, "coef_"): if self.fit_intercept: - coef = np.concatenate((np.array([self.intercept_]), self.coef_)) + # LinearModelLoss needs intercept at the end of coefficient array. + coef = np.concatenate((self.coef_, np.array([self.intercept_]))) else: coef = self.coef_ else: if self.fit_intercept: - coef = np.zeros(n_features + 1) - coef[0] = link(np.average(y, weights=weights)) + coef = np.zeros(n_features + 1, dtype=X.dtype) + coef[-1] = self._linear_loss.base_loss.link.link( + np.average(y, weights=sample_weight) + ) else: - coef = np.zeros(n_features) - - # algorithms for optimization + coef = np.zeros(n_features, dtype=X.dtype) + # Algorithms for optimization: + # Note again that our losses implement 1/2 * deviance. 
if solver == "lbfgs": - - def func(coef, X, y, weights, alpha, family, link): - y_pred, devp = _y_pred_deviance_derivative( - coef, X, y, weights, family, link - ) - dev = family.deviance(y, y_pred, weights) - # offset if coef[0] is intercept - offset = 1 if self.fit_intercept else 0 - coef_scaled = alpha * coef[offset:] - obj = 0.5 * dev + 0.5 * (coef[offset:] @ coef_scaled) - objp = 0.5 * devp - objp[offset:] += coef_scaled - return obj, objp - - args = (X, y, weights, self.alpha, family, link) + func = self._linear_loss.loss_gradient + l2_reg_strength = self.alpha + # TODO: In the future, we would like + # n_threads = _openmp_effective_n_threads() + n_threads = 1 opt_res = scipy.optimize.minimize( func, @@ -332,14 +308,14 @@ def func(coef, X, y, weights, alpha, family, link): "gtol": self.tol, "ftol": 1e3 * np.finfo(float).eps, }, - args=args, + args=(X, y, sample_weight, l2_reg_strength, n_threads), ) self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x if self.fit_intercept: - self.intercept_ = coef[0] - self.coef_ = coef[1:] + self.intercept_ = coef[-1] + self.coef_ = coef[:-1] else: # set intercept to zero as the other linear models do self.intercept_ = 0.0 @@ -350,6 +326,8 @@ def func(coef, X, y, weights, alpha, family, link): def _linear_predictor(self, X): """Compute the linear_predictor = `X @ coef_ + intercept_`. + Note that we often use the term raw_prediction instead of linear predictor. + Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) @@ -385,8 +363,8 @@ def predict(self, X): Returns predicted values. """ # check_array is done in _linear_predictor - eta = self._linear_predictor(X) - y_pred = self._link_instance.inverse(eta) + raw_prediction = self._linear_predictor(X) + y_pred = self._linear_loss.base_loss.link.inverse(raw_prediction) return y_pred def score(self, X, y, sample_weight=None): @@ -394,7 +372,7 @@ def score(self, X, y, sample_weight=None): D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal - for ``family='normal'``. + for ``base_loss_class=HalfSquaredError``. D^2 is defined as :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, @@ -423,24 +401,57 @@ def score(self, X, y, sample_weight=None): # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) - weights = _check_sample_weight(sample_weight, X) - y_pred = self.predict(X) - dev = self._family_instance.deviance(y, y_pred, weights=weights) - y_mean = np.average(y, weights=weights) - dev_null = self._family_instance.deviance(y, y_mean, weights=weights) - return 1 - dev / dev_null + raw_prediction = self._linear_predictor(X) # validates X + # required by losses + y = check_array(y, dtype=[np.float64, np.float32], order="C", ensure_2d=False) + + if sample_weight is not None: + # Note that _check_sample_weight calls check_array(order="C") required by + # losses. + sample_weight = _check_sample_weight( + sample_weight, X, dtype=[np.float64, np.float32] + ) + + base_loss = self._linear_loss.base_loss + + if not base_loss.in_y_true_range(y): + raise ValueError( + "Some value(s) of y are out of the valid range of the loss" + f" {self.base_loss_class.__name__}." + ) + + # Note that constant_to_optimal_zero is already multiplied by sample_weight. 
+ constant = np.mean(base_loss.constant_to_optimal_zero(y_true=y)) + if sample_weight is not None: + constant *= sample_weight.shape[0] / np.sum(sample_weight) + + # Missing factor of 2 in deviance cancels out. + deviance = base_loss( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=1, + ) + y_mean = base_loss.link.link(np.average(y, weights=sample_weight)) + deviance_null = base_loss( + y_true=y, + raw_prediction=np.tile(y_mean, y.shape[0]), + sample_weight=sample_weight, + n_threads=1, + ) + return 1 - (deviance + constant) / (deviance_null + constant) def _more_tags(self): - # create the _family_instance if fit wasn't called yet. - if hasattr(self, "_family_instance"): - _family_instance = self._family_instance - elif isinstance(self.family, ExponentialDispersionModel): - _family_instance = self.family - elif self.family in EDM_DISTRIBUTIONS: - _family_instance = EDM_DISTRIBUTIONS[self.family]() - else: - raise ValueError - return {"requires_positive_y": not _family_instance.in_y_range(-1.0)} + # create instance of BaseLoss if fit wasn't called yet. + base_loss = self.base_loss_class(**self.base_loss_params) + return {"requires_positive_y": not base_loss.in_y_true_range(-1.0)} + + def _get_base_loss_instance(self): + # This is only necessary because of the link and power arguments of the + # TweedieRegressor. + # Note that we do not need to pass sample_weight to the loss class as this is + # only needed to set loss.constant_hessian on which GLMs do not rely. + return self.base_loss_class(**self.base_loss_params) class PoissonRegressor(GeneralizedLinearRegressor): @@ -540,28 +551,22 @@ def __init__( warm_start=False, verbose=0, ): - super().__init__( alpha=alpha, fit_intercept=fit_intercept, - family="poisson", - link="log", + base_loss_class=HalfPoissonLoss, max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose, ) - @property - def family(self): - """Return the string `'poisson'`.""" - # Make this attribute read-only to avoid mis-uses e.g. in GridSearch. - return "poisson" - - @family.setter - def family(self, value): - if value != "poisson": - raise ValueError("PoissonRegressor.family must be 'poisson'!") + def _get_base_loss_instance(self): + if self.base_loss_class != HalfPoissonLoss: + raise ValueError( + "PoissonRegressor.base_loss_class must be HalfPoissonLoss!" + ) + return HalfPoissonLoss() class GammaRegressor(GeneralizedLinearRegressor): @@ -661,28 +666,20 @@ def __init__( warm_start=False, verbose=0, ): - super().__init__( alpha=alpha, fit_intercept=fit_intercept, - family="gamma", - link="log", + base_loss_class=HalfGammaLoss, max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose, ) - @property - def family(self): - """Return the family of the regressor.""" - # Make this attribute read-only to avoid mis-uses e.g. in GridSearch. - return "gamma" - - @family.setter - def family(self, value): - if value != "gamma": - raise ValueError("GammaRegressor.family must be 'gamma'!") + def _get_base_loss_instance(self): + if self.base_loss_class != HalfGammaLoss: + raise ValueError("GammaRegressor.base_loss_class must be HalfGammaLoss!") + return HalfGammaLoss() class TweedieRegressor(GeneralizedLinearRegressor): @@ -731,10 +728,11 @@ class TweedieRegressor(GeneralizedLinearRegressor): link : {'auto', 'identity', 'log'}, default='auto' The link function of the GLM, i.e. mapping from linear predictor `X @ coeff + intercept` to prediction `y_pred`. 
Option 'auto' sets - the link depending on the chosen family as follows: + the link depending on the chosen base_loss_class (EDM family) as follows: - - 'identity' for Normal distribution - - 'log' for Poisson, Gamma and Inverse Gaussian distributions + - 'identity' for ``power <= 0``, e.g. for the Normal distribution + - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian + distributions max_iter : int, default=100 The maximal number of iterations for the solver. @@ -813,33 +811,48 @@ def __init__( warm_start=False, verbose=0, ): - super().__init__( alpha=alpha, fit_intercept=fit_intercept, - family=TweedieDistribution(power=power), - link=link, + base_loss_class=HalfTweedieLoss, + base_loss_params={"power": power}, max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose, ) + self.link = link + self.power = power + + def _get_base_loss_instance(self): + # This is only necessary because of the link and power arguments of the + # TweedieRegressor. + if self.base_loss_class not in [HalfTweedieLoss, HalfTweedieLossIdentity]: + raise ValueError( + "TweedieRegressor.base_loss_class must be HalfTweedieLoss or" + " HalfTweedieLossIdentity!" + ) + + if list(self.base_loss_params.keys()) != ["power"]: + raise ValueError( + "TweedieRegressor must have base_loss_params={'power': some float}." + ) + # In case parameter power was reset, make it consistent with base_loss_param + self.base_loss_params["power"] = self.power - @property - def family(self): - """Return the family of the regressor.""" - # We use a property with a setter to make sure that the family is - # always a Tweedie distribution, and that self.power and - # self.family.power are identical by construction. - dist = TweedieDistribution(power=self.power) - # TODO: make the returned object immutable - return dist - - @family.setter - def family(self, value): - if isinstance(value, TweedieDistribution): - self.power = value.power + if self.link == "auto": + if self.power <= 0: + # identity link + return HalfTweedieLossIdentity(power=self.power) + else: + # log link + return HalfTweedieLoss(power=self.power) + elif self.link == "log": + return HalfTweedieLoss(power=self.power) + elif self.link == "identity": + return HalfTweedieLossIdentity(power=self.power) else: - raise TypeError( - "TweedieRegressor.family must be of type TweedieDistribution!" 
+ raise ValueError( + "The link must be an element of ['auto', 'identity', 'log']; " + "got (link={0})".format(self.link) ) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 87fe2b51f4d28..10c598c9b5bc8 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -2,27 +2,27 @@ # # License: BSD 3 clause +import re import numpy as np from numpy.testing import assert_allclose import pytest import warnings +from sklearn._loss.link import IdentityLink, LogLink +from sklearn._loss.loss import ( + HalfGammaLoss, + HalfPoissonLoss, + HalfSquaredError, + HalfTweedieLoss, + HalfTweedieLossIdentity, +) from sklearn.datasets import make_regression from sklearn.linear_model._glm import GeneralizedLinearRegressor +from sklearn.linear_model._glm.glm import HalfInverseGaussianLoss from sklearn.linear_model import TweedieRegressor, PoissonRegressor, GammaRegressor -from sklearn.linear_model._glm.link import ( - IdentityLink, - LogLink, -) -from sklearn._loss.glm_distribution import ( - TweedieDistribution, - NormalDistribution, - PoissonDistribution, - GammaDistribution, - InverseGaussianDistribution, -) from sklearn.linear_model import Ridge from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import d2_tweedie_score from sklearn.model_selection import train_test_split @@ -57,59 +57,6 @@ def test_sample_weights_validation(): glm.fit(X, y, weights) -@pytest.mark.parametrize( - "name, instance", - [ - ("normal", NormalDistribution()), - ("poisson", PoissonDistribution()), - ("gamma", GammaDistribution()), - ("inverse-gaussian", InverseGaussianDistribution()), - ], -) -def test_glm_family_argument(name, instance): - """Test GLM family argument set as string.""" - y = np.array([0.1, 0.5]) # in range of all distributions - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y) - assert isinstance(glm._family_instance, instance.__class__) - - glm = GeneralizedLinearRegressor(family="not a family") - with pytest.raises(ValueError, match="family must be"): - glm.fit(X, y) - - -@pytest.mark.parametrize( - "name, instance", [("identity", IdentityLink()), ("log", LogLink())] -) -def test_glm_link_argument(name, instance): - """Test GLM link argument set as string.""" - y = np.array([0.1, 0.5]) # in range of all distributions - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family="normal", link=name).fit(X, y) - assert isinstance(glm._link_instance, instance.__class__) - - glm = GeneralizedLinearRegressor(family="normal", link="not a link") - with pytest.raises(ValueError, match="link must be"): - glm.fit(X, y) - - -@pytest.mark.parametrize( - "family, expected_link_class", - [ - ("normal", IdentityLink), - ("poisson", LogLink), - ("gamma", LogLink), - ("inverse-gaussian", LogLink), - ], -) -def test_glm_link_auto(family, expected_link_class): - # Make sure link='auto' delivers the expected link function - y = np.array([0.1, 0.5]) # in range of all distributions - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family=family, link="auto").fit(X, y) - assert isinstance(glm._link_instance, expected_link_class) - - @pytest.mark.parametrize("fit_intercept", ["not bool", 1, 0, [True]]) def test_glm_fit_intercept_argument(fit_intercept): """Test GLM for invalid fit_intercept argument.""" @@ -213,8 +160,7 @@ def test_glm_identity_regression(fit_intercept): y = np.dot(X, coef) glm = GeneralizedLinearRegressor( alpha=0, - family="normal", 
- link="identity", + base_loss_class=HalfSquaredError, fit_intercept=fit_intercept, tol=1e-12, ) @@ -229,8 +175,10 @@ def test_glm_identity_regression(fit_intercept): @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize("alpha", [0.0, 1.0]) -@pytest.mark.parametrize("family", ["normal", "poisson", "gamma"]) -def test_glm_sample_weight_consistentcy(fit_intercept, alpha, family): +@pytest.mark.parametrize( + "base_loss_class", [HalfSquaredError, HalfPoissonLoss, HalfGammaLoss] +) +def test_glm_sample_weight_consistentcy(fit_intercept, alpha, base_loss_class): """Test that the impact of sample_weight is consistent""" rng = np.random.RandomState(0) n_samples, n_features = 10, 5 @@ -238,7 +186,7 @@ def test_glm_sample_weight_consistentcy(fit_intercept, alpha, family): X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) glm_params = dict( - alpha=alpha, family=family, link="auto", fit_intercept=fit_intercept + alpha=alpha, base_loss_class=base_loss_class, fit_intercept=fit_intercept ) glm = GeneralizedLinearRegressor(**glm_params).fit(X, y) @@ -280,28 +228,32 @@ def test_glm_sample_weight_consistentcy(fit_intercept, alpha, family): @pytest.mark.parametrize("fit_intercept", [True, False]) @pytest.mark.parametrize( - "family", + "base_loss_class, base_loss_param", [ - NormalDistribution(), - PoissonDistribution(), - GammaDistribution(), - InverseGaussianDistribution(), - TweedieDistribution(power=1.5), - TweedieDistribution(power=4.5), + (HalfPoissonLoss, {}), + (HalfGammaLoss, {}), + (HalfInverseGaussianLoss, {}), + (HalfTweedieLoss, {"power": 0}), + (HalfTweedieLoss, {"power": 1.5}), + (HalfTweedieLoss, {"power": 4.5}), ], ) -def test_glm_log_regression(fit_intercept, family): +def test_glm_log_regression(fit_intercept, base_loss_class, base_loss_param): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] - X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + X = np.array([[0, 1, 2, 3, 4], [1, 1, 1, 1, 1]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link="log", fit_intercept=fit_intercept, tol=1e-7 + alpha=0, + base_loss_class=base_loss_class, + base_loss_params=base_loss_param, + fit_intercept=fit_intercept, + tol=1e-8, ) if fit_intercept: - res = glm.fit(X[:, 1:], y) - assert_allclose(res.coef_, coef[1:], rtol=1e-6) - assert_allclose(res.intercept_, coef[0], rtol=1e-6) + res = glm.fit(X[:, :-1], y) + assert_allclose(res.coef_, coef[:-1], rtol=1e-6) + assert_allclose(res.intercept_, coef[-1], rtol=1e-6) else: res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=2e-6) @@ -391,8 +343,7 @@ def test_normal_ridge_comparison( glm = GeneralizedLinearRegressor( alpha=alpha, - family="normal", - link="identity", + base_loss_class=HalfSquaredError, fit_intercept=fit_intercept, max_iter=300, tol=1e-5, @@ -423,8 +374,7 @@ def test_poisson_glmnet(): glm = GeneralizedLinearRegressor( alpha=1, fit_intercept=True, - family="poisson", - link="log", + base_loss_class=HalfPoissonLoss, tol=1e-7, max_iter=300, ) @@ -441,47 +391,107 @@ def test_convergence_warning(regression_data): est.fit(X, y) -def test_poisson_regression_family(regression_data): - # Make sure the family attribute is read-only to prevent searching over it - # e.g. 
in a grid search - est = PoissonRegressor() - est.family == "poisson" +@pytest.mark.parametrize( + "estimator, base_loss", + [(PoissonRegressor, HalfPoissonLoss), (GammaRegressor, HalfGammaLoss)], +) +def test_raise_on_reset_base_loss_class(regression_data, estimator, base_loss): + # Make sure to raise an appropriate error if base_loss_class was reset. + X, y = regression_data - msg = "PoissonRegressor.family must be 'poisson'!" + est = estimator() + est.base_loss_class = HalfSquaredError + msg = f"{estimator.__name__}.base_loss_class must be {base_loss.__name__}!" with pytest.raises(ValueError, match=msg): - est.family = 0 - + est.fit(X, y) -def test_gamma_regression_family(regression_data): - # Make sure the family attribute is read-only to prevent searching over it - # e.g. in a grid search - est = GammaRegressor() - est.family == "gamma" - - msg = "GammaRegressor.family must be 'gamma'!" - with pytest.raises(ValueError, match=msg): - est.family = 0 +def test_tweedie_raise_on_reset_base_loss_class(regression_data): + # Make sure to raise an appropriate error if base_loss_class or base_loss_params + # was inconsistently reset for TweedieDistribution + X, y = regression_data + # make y positive + y = np.abs(y) + 1.0 -def test_tweedie_regression_family(regression_data): - # Make sure the family attribute is always a TweedieDistribution and that - # the power attribute is properly updated power = 2.0 est = TweedieRegressor(power=power) - assert isinstance(est.family, TweedieDistribution) - assert est.family.power == power + assert est.base_loss_class, HalfTweedieLoss + assert est.base_loss_params == {"power": power} assert est.power == power + # reset power new_power = 0 - new_family = TweedieDistribution(power=new_power) - est.family = new_family - assert isinstance(est.family, TweedieDistribution) - assert est.family.power == new_power - assert est.power == new_power - - msg = "TweedieRegressor.family must be of type TweedieDistribution!" - with pytest.raises(TypeError, match=msg): - est.family = None + est.power = new_power + est.fit(X, y) # fit resets base_loss_param["power"] + assert est.power == est.base_loss_params["power"] + + # reset base_loss_params + est.base_loss_params = {"should error": True} + msg = re.escape( + "TweedieRegressor must have base_loss_params={'power': some float}." + ) + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + # reset base_loss_class + est = TweedieRegressor(power=power) + est.base_loss_class = HalfTweedieLossIdentity + est.fit(X, y) + est.base_loss_class = HalfSquaredError + msg = ( + "TweedieRegressor.base_loss_class must be HalfTweedieLoss or" + " HalfTweedieLossIdentity!" 
+ ) + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + +@pytest.mark.parametrize( + "name, link_class", [("identity", IdentityLink), ("log", LogLink)] +) +def test_tweedie_link_argument(name, link_class): + """Test GLM link argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = TweedieRegressor(power=1, link=name).fit(X, y) + assert isinstance(glm._linear_loss.base_loss.link, link_class) + + glm = TweedieRegressor(power=1, link="not a link") + with pytest.raises( + ValueError, + match=re.escape("The link must be an element of ['auto', 'identity', 'log']"), + ): + glm.fit(X, y) + + +@pytest.mark.parametrize( + "power, expected_link_class", + [ + (0, IdentityLink), # normal + (1, LogLink), # poisson + (2, LogLink), # gamma + (3, LogLink), # inverse-gaussian + ], +) +def test_tweedie_link_auto(power, expected_link_class): + """Test that link='auto' delivers the expected link function""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = TweedieRegressor(link="auto", power=power).fit(X, y) + assert isinstance(glm._linear_loss.base_loss.link, expected_link_class) + + +@pytest.mark.parametrize("power", [0, 1, 1.5, 2, 3]) +@pytest.mark.parametrize("link", ["log", "identity"]) +def test_tweedie_score(regression_data, power, link): + """Test that GLM score equals d2_tweedie_score for Tweedie losses.""" + X, y = regression_data + # make y positive + y = np.abs(y) + 1.0 + glm = TweedieRegressor(power=power, link=link).fit(X, y) + assert glm.score(X, y) == pytest.approx( + d2_tweedie_score(y, glm.predict(X), power=power) + ) @pytest.mark.parametrize( From fd8d1ca28fbd9ebd2b44898fa1012d3a547f95f6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 11:50:14 +0100 Subject: [PATCH 07/43] MAINT remove link.py in _glm submodule --- sklearn/linear_model/_glm/link.py | 110 ------------------- sklearn/linear_model/_glm/tests/test_link.py | 43 -------- 2 files changed, 153 deletions(-) delete mode 100644 sklearn/linear_model/_glm/link.py delete mode 100644 sklearn/linear_model/_glm/tests/test_link.py diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py deleted file mode 100644 index 878d8e835bc42..0000000000000 --- a/sklearn/linear_model/_glm/link.py +++ /dev/null @@ -1,110 +0,0 @@ -""" -Link functions used in GLM -""" - -# Author: Christian Lorentzen -# License: BSD 3 clause - -from abc import ABCMeta, abstractmethod - -import numpy as np -from scipy.special import expit, logit - - -class BaseLink(metaclass=ABCMeta): - """Abstract base class for Link functions.""" - - @abstractmethod - def __call__(self, y_pred): - """Compute the link function g(y_pred). - - The link function links the mean y_pred=E[Y] to the so called linear - predictor (X*w), i.e. g(y_pred) = linear predictor. - - Parameters - ---------- - y_pred : array of shape (n_samples,) - Usually the (predicted) mean. - """ - - @abstractmethod - def derivative(self, y_pred): - """Compute the derivative of the link g'(y_pred). - - Parameters - ---------- - y_pred : array of shape (n_samples,) - Usually the (predicted) mean. - """ - - @abstractmethod - def inverse(self, lin_pred): - """Compute the inverse link function h(lin_pred). - - Gives the inverse relationship between linear predictor and the mean - y_pred=E[Y], i.e. h(linear predictor) = y_pred. - - Parameters - ---------- - lin_pred : array of shape (n_samples,) - Usually the (fitted) linear predictor. 
- """ - - @abstractmethod - def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse link function h'(lin_pred). - - Parameters - ---------- - lin_pred : array of shape (n_samples,) - Usually the (fitted) linear predictor. - """ - - -class IdentityLink(BaseLink): - """The identity link function g(x)=x.""" - - def __call__(self, y_pred): - return y_pred - - def derivative(self, y_pred): - return np.ones_like(y_pred) - - def inverse(self, lin_pred): - return lin_pred - - def inverse_derivative(self, lin_pred): - return np.ones_like(lin_pred) - - -class LogLink(BaseLink): - """The log link function g(x)=log(x).""" - - def __call__(self, y_pred): - return np.log(y_pred) - - def derivative(self, y_pred): - return 1 / y_pred - - def inverse(self, lin_pred): - return np.exp(lin_pred) - - def inverse_derivative(self, lin_pred): - return np.exp(lin_pred) - - -class LogitLink(BaseLink): - """The logit link function g(x)=logit(x).""" - - def __call__(self, y_pred): - return logit(y_pred) - - def derivative(self, y_pred): - return 1 / (y_pred * (1 - y_pred)) - - def inverse(self, lin_pred): - return expit(lin_pred) - - def inverse_derivative(self, lin_pred): - ep = expit(lin_pred) - return ep * (1 - ep) diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py deleted file mode 100644 index a52d05b7cff6e..0000000000000 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ /dev/null @@ -1,43 +0,0 @@ -# Authors: Christian Lorentzen -# -# License: BSD 3 clause -import numpy as np -from numpy.testing import assert_allclose -import pytest -from scipy.optimize import check_grad - -from sklearn.linear_model._glm.link import ( - IdentityLink, - LogLink, - LogitLink, -) - - -LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink] - - -@pytest.mark.parametrize("Link", LINK_FUNCTIONS) -def test_link_properties(Link): - """Test link inverse and derivative.""" - rng = np.random.RandomState(42) - x = rng.rand(100) * 100 - link = Link() - if isinstance(link, LogitLink): - # careful for large x, note expit(36) = 1 - # limit max eta to 15 - x = x / 100 * 15 - assert_allclose(link(link.inverse(x)), x) - # if g(h(x)) = x, then g'(h(x)) = 1/h'(x) - # g = link, h = link.inverse - assert_allclose(link.derivative(link.inverse(x)), 1 / link.inverse_derivative(x)) - - -@pytest.mark.parametrize("Link", LINK_FUNCTIONS) -def test_link_derivative(Link): - link = Link() - x = np.random.RandomState(0).rand(1) - err = check_grad(link, link.derivative, x) / link.derivative(x) - assert abs(err) < 1e-6 - - err = check_grad(link.inverse, link.inverse_derivative, x) / link.derivative(x) - assert abs(err) < 1e-6 From 09b243a3735dce74d24b602981b487bd84509b59 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 12:03:43 +0100 Subject: [PATCH 08/43] CLN googlemail to gmail --- sklearn/_loss/link.py | 2 +- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/metrics/_regression.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index 5104ae2ca6ec5..68d1b89116b99 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -1,7 +1,7 @@ """ Module contains classes for invertible (and differentiable) link functions. 
""" -# Author: Christian Lorentzen +# Author: Christian Lorentzen from abc import ABC, abstractmethod from dataclasses import dataclass diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index cc3e61a69d810..d247d762b84b0 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -2,7 +2,7 @@ Generalized Linear Models with Exponential Dispersion Family """ -# Author: Christian Lorentzen +# Author: Christian Lorentzen # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index df4f6a74ee849..a037098014faa 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -19,7 +19,7 @@ # Manoj Kumar # Michael Eickenberg # Konstantin Shmelkov -# Christian Lorentzen +# Christian Lorentzen # Ashutosh Hathidara # Uttam kumar # Sylvain Marie From cecb3288bf0018e5277ec18b8b527c4c76c46ad1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 18:10:16 +0100 Subject: [PATCH 09/43] MAINT copy tweedie deviance to regression metrics --- sklearn/metrics/_regression.py | 72 ++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index a037098014faa..7315c84daf74a 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -25,15 +25,17 @@ # Sylvain Marie # License: BSD 3 clause +import numbers import warnings import numpy as np +from scipy.special import xlogy -from .._loss.glm_distribution import TweedieDistribution from ..exceptions import UndefinedMetricWarning from ..utils.validation import ( check_array, check_consistent_length, + check_scalar, _num_samples, column_or_1d, _check_sample_weight, @@ -1026,8 +1028,56 @@ def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): sample_weight = column_or_1d(sample_weight) sample_weight = sample_weight[:, np.newaxis] - dist = TweedieDistribution(power=power) - dev = dist.unit_deviance(y_true, y_pred, check_input=True) + p = check_scalar( + power, + name="power", + target_type=numbers.Real, + ) + + message = f"Mean Tweedie deviance error with power={p} can only be used on " + if p < 0: + # 'Extreme stable', y any real number, y_pred > 0 + if (y_pred <= 0).any(): + raise ValueError(message + "strictly positive y_pred.") + elif p == 0: + # Normal, y and y_pred can be any real number + pass + elif 0 < p < 1: + raise ValueError("Tweedie deviance is only defined for power<=0 and power>=1.") + elif 1 <= p < 2: + # Poisson and compound Poisson distribution, y >= 0, y_pred > 0 + if (y_true < 0).any() or (y_pred <= 0).any(): + raise ValueError(message + "non-negative y and strictly positive y_pred.") + elif p >= 2: + # Gamma and Extreme stable distribution, y and y_pred > 0 + if (y_true <= 0).any() or (y_pred <= 0).any(): + raise ValueError(message + "strictly positive y and y_pred.") + else: # pragma: nocover + # Unreachable statement + raise ValueError + + if p < 0: + # 'Extreme stable', y any real number, y_pred > 0 + dev = 2 * ( + np.power(np.maximum(y_true, 0), 2 - p) / ((1 - p) * (2 - p)) + - y_true * np.power(y_pred, 1 - p) / (1 - p) + + np.power(y_pred, 2 - p) / (2 - p) + ) + elif p == 0: + # Normal distribution, y and y_pred any real number + dev = (y_true - y_pred) ** 2 + elif p == 1: + # Poisson distribution + dev = 2 * (xlogy(y_true, y_true / y_pred) - y_true + y_pred) + elif p == 2: + # Gamma distribution + dev = 
2 * (np.log(y_pred / y_true) + y_true / y_pred - 1) + else: + dev = 2 * ( + np.power(y_true, 2 - p) / ((1 - p) * (2 - p)) + - y_true * np.power(y_pred, 1 - p) / (1 - p) + + np.power(y_pred, 2 - p) / (2 - p) + ) return np.average(dev, weights=sample_weight) @@ -1191,17 +1241,13 @@ def d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0): warnings.warn(msg, UndefinedMetricWarning) return float("nan") - if sample_weight is not None: - sample_weight = column_or_1d(sample_weight) - sample_weight = sample_weight[:, np.newaxis] - - dist = TweedieDistribution(power=power) - - dev = dist.unit_deviance(y_true, y_pred, check_input=True) - numerator = np.average(dev, weights=sample_weight) + numerator = mean_tweedie_deviance( + y_true, y_pred, sample_weight=sample_weight, power=power + ) y_avg = np.average(y_true, weights=sample_weight) - dev = dist.unit_deviance(y_true, y_avg, check_input=True) - denominator = np.average(dev, weights=sample_weight) + denominator = mean_tweedie_deviance( + y_true, y_avg, sample_weight=sample_weight, power=power + ) return 1 - numerator / denominator From 719bb1be3ee049be23ae927efb357578d04949f7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 18:20:53 +0100 Subject: [PATCH 10/43] MAINT remove glm_distribution.py --- sklearn/_loss/glm_distribution.py | 369 ------------------- sklearn/_loss/tests/test_glm_distribution.py | 121 ------ 2 files changed, 490 deletions(-) delete mode 100644 sklearn/_loss/glm_distribution.py delete mode 100644 sklearn/_loss/tests/test_glm_distribution.py diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py deleted file mode 100644 index dfc512c8b10b7..0000000000000 --- a/sklearn/_loss/glm_distribution.py +++ /dev/null @@ -1,369 +0,0 @@ -""" -Distribution functions used in GLM -""" - -# Author: Christian Lorentzen -# License: BSD 3 clause - -from abc import ABCMeta, abstractmethod -from collections import namedtuple -import numbers - -import numpy as np -from scipy.special import xlogy - - -DistributionBoundary = namedtuple("DistributionBoundary", ("value", "inclusive")) - - -class ExponentialDispersionModel(metaclass=ABCMeta): - r"""Base class for reproductive Exponential Dispersion Models (EDM). - - The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by - - .. math:: p(y| \theta, \phi) = c(y, \phi) - \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) - = \tilde{c}(y, \phi) - \exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right) - - with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`, - variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`, - unit variance :math:`v(y_\textrm{pred})` and - unit deviance :math:`d(y,y_\textrm{pred})`. - - Methods - ------- - deviance - deviance_derivative - in_y_range - unit_deviance - unit_deviance_derivative - unit_variance - - References - ---------- - https://en.wikipedia.org/wiki/Exponential_dispersion_model. - """ - - def in_y_range(self, y): - """Returns ``True`` if y is in the valid range of Y~EDM. - - Parameters - ---------- - y : array of shape (n_samples,) - Target values. 
- """ - # Note that currently supported distributions have +inf upper bound - - if not isinstance(self._lower_bound, DistributionBoundary): - raise TypeError( - "_lower_bound attribute must be of type DistributionBoundary" - ) - - if self._lower_bound.inclusive: - return np.greater_equal(y, self._lower_bound.value) - else: - return np.greater(y, self._lower_bound.value) - - @abstractmethod - def unit_variance(self, y_pred): - r"""Compute the unit variance function. - - The unit variance :math:`v(y_\textrm{pred})` determines the variance as - a function of the mean :math:`y_\textrm{pred}` by - :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(y_\textrm{pred}_i)`. - It can also be derived from the unit deviance - :math:`d(y,y_\textrm{pred})` as - - .. math:: v(y_\textrm{pred}) = \frac{2}{ - \frac{\partial^2 d(y,y_\textrm{pred})}{ - \partialy_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}} - - See also :func:`variance`. - - Parameters - ---------- - y_pred : array of shape (n_samples,) - Predicted mean. - """ - - @abstractmethod - def unit_deviance(self, y, y_pred, check_input=False): - r"""Compute the unit deviance. - - The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the - log-likelihood as - :math:`d(y,y_\textrm{pred}) = -2\phi\cdot - \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` - - Parameters - ---------- - y : array of shape (n_samples,) - Target values. - - y_pred : array of shape (n_samples,) - Predicted mean. - - check_input : bool, default=False - If True raise an exception on invalid y or y_pred values, otherwise - they will be propagated as NaN. - Returns - ------- - deviance: array of shape (n_samples,) - Computed deviance - """ - - def unit_deviance_derivative(self, y, y_pred): - r"""Compute the derivative of the unit deviance w.r.t. y_pred. - - The derivative of the unit deviance is given by - :math:`\frac{\partial}{\partialy_\textrm{pred}}d(y,y_\textrm{pred}) - = -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}` - with unit variance :math:`v(y_\textrm{pred})`. - - Parameters - ---------- - y : array of shape (n_samples,) - Target values. - - y_pred : array of shape (n_samples,) - Predicted mean. - """ - return -2 * (y - y_pred) / self.unit_variance(y_pred) - - def deviance(self, y, y_pred, weights=1): - r"""Compute the deviance. - - The deviance is a weighted sum of the per sample unit deviances, - :math:`D = \sum_i s_i \cdot d(y_i, y_\textrm{pred}_i)` - with weights :math:`s_i` and unit deviance - :math:`d(y,y_\textrm{pred})`. - In terms of the log-likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,y_\textrm{pred},\frac{phi}{s}) - - loglike(y,y,\frac{phi}{s})\right)`. - - Parameters - ---------- - y : array of shape (n_samples,) - Target values. - - y_pred : array of shape (n_samples,) - Predicted mean. - - weights : {int, array of shape (n_samples,)}, default=1 - Weights or exposure to which variance is inverse proportional. - """ - return np.sum(weights * self.unit_deviance(y, y_pred)) - - def deviance_derivative(self, y, y_pred, weights=1): - r"""Compute the derivative of the deviance w.r.t. y_pred. - - It gives :math:`\frac{\partial}{\partial y_\textrm{pred}} - D(y, \y_\textrm{pred}; weights)`. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - y_pred : array, shape (n_samples,) - Predicted mean. - - weights : {int, array of shape (n_samples,)}, default=1 - Weights or exposure to which variance is inverse proportional. 
- """ - return weights * self.unit_deviance_derivative(y, y_pred) - - -class TweedieDistribution(ExponentialDispersionModel): - r"""A class for the Tweedie distribution. - - A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]` - is uniquely defined by it's mean-variance relationship - :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^power`. - - Special cases are: - - ===== ================ - Power Distribution - ===== ================ - 0 Normal - 1 Poisson - (1,2) Compound Poisson - 2 Gamma - 3 Inverse Gaussian - - Parameters - ---------- - power : float, default=0 - The variance power of the `unit_variance` - :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`. - For ``0=1." - ) - elif 1 <= power < 2: - # Poisson or Compound Poisson distribution - self._lower_bound = DistributionBoundary(0, inclusive=True) - elif power >= 2: - # Gamma, Positive Stable, Inverse Gaussian distributions - self._lower_bound = DistributionBoundary(0, inclusive=False) - else: # pragma: no cover - # this branch should be unreachable. - raise ValueError - - self._power = power - - def unit_variance(self, y_pred): - """Compute the unit variance of a Tweedie distribution - v(y_\textrm{pred})=y_\textrm{pred}**power. - - Parameters - ---------- - y_pred : array of shape (n_samples,) - Predicted mean. - """ - return np.power(y_pred, self.power) - - def unit_deviance(self, y, y_pred, check_input=False): - r"""Compute the unit deviance. - - The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the - log-likelihood as - :math:`d(y,y_\textrm{pred}) = -2\phi\cdot - \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` - - Parameters - ---------- - y : array of shape (n_samples,) - Target values. - - y_pred : array of shape (n_samples,) - Predicted mean. - - check_input : bool, default=False - If True raise an exception on invalid y or y_pred values, otherwise - they will be propagated as NaN. - Returns - ------- - deviance: array of shape (n_samples,) - Computed deviance - """ - p = self.power - - if check_input: - message = ( - "Mean Tweedie deviance error with power={} can only be used on ".format( - p - ) - ) - if p < 0: - # 'Extreme stable', y any real number, y_pred > 0 - if (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_pred.") - elif p == 0: - # Normal, y and y_pred can be any real number - pass - elif 0 < p < 1: - raise ValueError( - "Tweedie deviance is only defined for power<=0 and power>=1." - ) - elif 1 <= p < 2: - # Poisson and compound Poisson distribution, y >= 0, y_pred > 0 - if (y < 0).any() or (y_pred <= 0).any(): - raise ValueError( - message + "non-negative y and strictly positive y_pred." - ) - elif p >= 2: - # Gamma and Extreme stable distribution, y and y_pred > 0 - if (y <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y and y_pred.") - else: # pragma: nocover - # Unreachable statement - raise ValueError - - if p < 0: - # 'Extreme stable', y any real number, y_pred > 0 - dev = 2 * ( - np.power(np.maximum(y, 0), 2 - p) / ((1 - p) * (2 - p)) - - y * np.power(y_pred, 1 - p) / (1 - p) - + np.power(y_pred, 2 - p) / (2 - p) - ) - - elif p == 0: - # Normal distribution, y and y_pred any real number - dev = (y - y_pred) ** 2 - elif p < 1: - raise ValueError( - "Tweedie deviance is only defined for power<=0 and power>=1." 
- ) - elif p == 1: - # Poisson distribution - dev = 2 * (xlogy(y, y / y_pred) - y + y_pred) - elif p == 2: - # Gamma distribution - dev = 2 * (np.log(y_pred / y) + y / y_pred - 1) - else: - dev = 2 * ( - np.power(y, 2 - p) / ((1 - p) * (2 - p)) - - y * np.power(y_pred, 1 - p) / (1 - p) - + np.power(y_pred, 2 - p) / (2 - p) - ) - return dev - - -class NormalDistribution(TweedieDistribution): - """Class for the Normal (aka Gaussian) distribution.""" - - def __init__(self): - super().__init__(power=0) - - -class PoissonDistribution(TweedieDistribution): - """Class for the scaled Poisson distribution.""" - - def __init__(self): - super().__init__(power=1) - - -class GammaDistribution(TweedieDistribution): - """Class for the Gamma distribution.""" - - def __init__(self): - super().__init__(power=2) - - -class InverseGaussianDistribution(TweedieDistribution): - """Class for the scaled InverseGaussianDistribution distribution.""" - - def __init__(self): - super().__init__(power=3) - - -EDM_DISTRIBUTIONS = { - "normal": NormalDistribution, - "poisson": PoissonDistribution, - "gamma": GammaDistribution, - "inverse-gaussian": InverseGaussianDistribution, -} diff --git a/sklearn/_loss/tests/test_glm_distribution.py b/sklearn/_loss/tests/test_glm_distribution.py deleted file mode 100644 index 453f61e2f3214..0000000000000 --- a/sklearn/_loss/tests/test_glm_distribution.py +++ /dev/null @@ -1,121 +0,0 @@ -# Authors: Christian Lorentzen -# -# License: BSD 3 clause -import numpy as np -from numpy.testing import ( - assert_allclose, - assert_array_equal, -) -from scipy.optimize import check_grad -import pytest - -from sklearn._loss.glm_distribution import ( - TweedieDistribution, - NormalDistribution, - PoissonDistribution, - GammaDistribution, - InverseGaussianDistribution, - DistributionBoundary, -) - - -@pytest.mark.parametrize( - "family, expected", - [ - (NormalDistribution(), [True, True, True]), - (PoissonDistribution(), [False, True, True]), - (TweedieDistribution(power=1.5), [False, True, True]), - (GammaDistribution(), [False, False, True]), - (InverseGaussianDistribution(), [False, False, True]), - (TweedieDistribution(power=4.5), [False, False, True]), - ], -) -def test_family_bounds(family, expected): - """Test the valid range of distributions at -1, 0, 1.""" - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, expected) - - -def test_invalid_distribution_bound(): - dist = TweedieDistribution() - dist._lower_bound = 0 - with pytest.raises(TypeError, match="must be of type DistributionBoundary"): - dist.in_y_range([-1, 0, 1]) - - -def test_tweedie_distribution_power(): - msg = "distribution is only defined for power<=0 and power>=1" - with pytest.raises(ValueError, match=msg): - TweedieDistribution(power=0.5) - - with pytest.raises(TypeError, match="must be a real number"): - TweedieDistribution(power=1j) - - with pytest.raises(TypeError, match="must be a real number"): - dist = TweedieDistribution() - dist.power = 1j - - dist = TweedieDistribution() - assert isinstance(dist._lower_bound, DistributionBoundary) - - assert dist._lower_bound.inclusive is False - dist.power = 1 - assert dist._lower_bound.value == 0.0 - assert dist._lower_bound.inclusive is True - - -@pytest.mark.parametrize( - "family, chk_values", - [ - (NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), - (PoissonDistribution(), [0.1, 1.5]), - (GammaDistribution(), [0.1, 1.5]), - (InverseGaussianDistribution(), [0.1, 1.5]), - (TweedieDistribution(power=-2.5), [0.1, 1.5]), - (TweedieDistribution(power=-1), [0.1, 
1.5]), - (TweedieDistribution(power=1.5), [0.1, 1.5]), - (TweedieDistribution(power=2.5), [0.1, 1.5]), - (TweedieDistribution(power=-4), [0.1, 1.5]), - ], -) -def test_deviance_zero(family, chk_values): - """Test deviance(y,y) = 0 for different families.""" - for x in chk_values: - assert_allclose(family.deviance(x, x), 0, atol=1e-9) - - -@pytest.mark.parametrize( - "family", - [ - NormalDistribution(), - PoissonDistribution(), - GammaDistribution(), - InverseGaussianDistribution(), - TweedieDistribution(power=-2.5), - TweedieDistribution(power=-1), - TweedieDistribution(power=1.5), - TweedieDistribution(power=2.5), - TweedieDistribution(power=-4), - ], - ids=lambda x: x.__class__.__name__, -) -def test_deviance_derivative(family): - """Test deviance derivative for different families.""" - rng = np.random.RandomState(0) - y_true = rng.rand(10) - # make data positive - y_true += np.abs(y_true.min()) + 1e-2 - - y_pred = y_true + np.fmax(rng.rand(10), 0.0) - - dev = family.deviance(y_true, y_pred) - assert isinstance(dev, float) - dev_derivative = family.deviance_derivative(y_true, y_pred) - assert dev_derivative.shape == y_pred.shape - - err = check_grad( - lambda y_pred: family.deviance(y_true, y_pred), - lambda y_pred: family.deviance_derivative(y_true, y_pred), - y_pred, - ) / np.linalg.norm(dev_derivative) - assert abs(err) < 1e-6 From 5f569688f78fb8d44e4ab2afdb432c7221568057 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 18:21:14 +0100 Subject: [PATCH 11/43] CLN fix comment --- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 10c598c9b5bc8..db1bf483085f0 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -408,7 +408,7 @@ def test_raise_on_reset_base_loss_class(regression_data, estimator, base_loss): def test_tweedie_raise_on_reset_base_loss_class(regression_data): # Make sure to raise an appropriate error if base_loss_class or base_loss_params - # was inconsistently reset for TweedieDistribution + # was inconsistently reset for a Tweedie deviance loss. X, y = regression_data # make y positive y = np.abs(y) + 1.0 From 7adc1bcb7efe193aec0962bf0b5c5aa064424451 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 19:59:24 +0100 Subject: [PATCH 12/43] CLN fix bug in tweedie deviance metrics --- sklearn/metrics/_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 7315c84daf74a..00ceca2f585b9 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -1234,18 +1234,18 @@ def d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0): ) if y_type == "continuous-multioutput": raise ValueError("Multioutput not supported in d2_tweedie_score") - check_consistent_length(y_true, y_pred, sample_weight) if _num_samples(y_pred) < 2: msg = "D^2 score is not well-defined with less than two samples." 
warnings.warn(msg, UndefinedMetricWarning) return float("nan") + y_true, y_pred = np.squeeze(y_true), np.squeeze(y_pred) numerator = mean_tweedie_deviance( y_true, y_pred, sample_weight=sample_weight, power=power ) - y_avg = np.average(y_true, weights=sample_weight) + y_avg = np.full_like(y_true, np.average(y_true, weights=sample_weight)) denominator = mean_tweedie_deviance( y_true, y_avg, sample_weight=sample_weight, power=power ) From 2c13683cc19266d50c018a7cd682f6824320c6d5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 21:09:14 +0100 Subject: [PATCH 13/43] FIX dtype because of lbfgs --- sklearn/linear_model/_glm/glm.py | 19 +++++++++++++------ sklearn/linear_model/_linear_loss.py | 14 +++++++------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index d247d762b84b0..c9cf2b70e07a8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -236,15 +236,21 @@ def fit(self, X, y, sample_weight=None): y_numeric=True, multi_output=False, ) + # required by losses - y = check_array(y, dtype=[np.float64, np.float32], order="C", ensure_2d=False) + if solver == "lbfgs": + # lbfgs will force coef and therefore raw_prediction to be float64. The + # base_loss needs y, X @ coef and sample_weight all of same dtype + # (and contiguous). + loss_dtype = np.float64 + else: + loss_dtype = min(max(y.dtype, X.dtype), np.float64) + y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False) # TODO: We could support samples_weight=None as the losses support it. # Note that _check_sample_weight calls check_array(order="C") required by # losses. - sample_weight = _check_sample_weight( - sample_weight, X, dtype=[np.float64, np.float32] - ) + sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype) n_samples, n_features = X.shape @@ -279,14 +285,15 @@ def fit(self, X, y, sample_weight=None): coef = np.concatenate((self.coef_, np.array([self.intercept_]))) else: coef = self.coef_ + coef = coef.astype(loss_dtype, copy=False) else: if self.fit_intercept: - coef = np.zeros(n_features + 1, dtype=X.dtype) + coef = np.zeros(n_features + 1, dtype=loss_dtype) coef[-1] = self._linear_loss.base_loss.link.link( np.average(y, weights=sample_weight) ) else: - coef = np.zeros(n_features, dtype=X.dtype) + coef = np.zeros(n_features, dtype=loss_dtype) # Algorithms for optimization: # Note again that our losses implement 1/2 * deviance. 
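The dtype handling added in the glm.py hunk above is easy to misread, so here is a small standalone sketch of the same rule (illustrative only; the helper name resolve_loss_dtype is made up for this sketch and is not part of the patch): with lbfgs the coefficients, and therefore the raw predictions, end up in float64, so y and sample_weight are promoted to float64 as well; otherwise the wider of X.dtype and y.dtype is kept, capped at float64.

import numpy as np

def resolve_loss_dtype(X, y, solver="lbfgs"):
    # lbfgs forces coef (and hence raw_prediction) to float64, so the loss
    # needs y and sample_weight in float64 too.
    if solver == "lbfgs":
        return np.float64
    # Otherwise keep the wider of the two input dtypes, never above float64.
    return min(max(y.dtype, X.dtype), np.float64)

X32 = np.ones((4, 2), dtype=np.float32)
y32 = np.ones(4, dtype=np.float32)
assert resolve_loss_dtype(X32, y32) is np.float64                  # lbfgs promotes
assert resolve_loss_dtype(X32, y32, solver="other") == np.float32  # float32 is kept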
diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index 88b42c952b509..64a99325dcd7a 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -196,13 +196,13 @@ def loss_gradient( if not self.base_loss.is_multiclass: loss += 0.5 * l2_reg_strength * (weights @ weights) - grad = np.empty_like(coef, dtype=X.dtype) + grad = np.empty_like(coef, dtype=weights.dtype) grad[:n_features] = X.T @ grad_per_sample + l2_reg_strength * weights if self.fit_intercept: grad[-1] = grad_per_sample.sum() else: loss += 0.5 * l2_reg_strength * squared_norm(weights) - grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F") + grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") # grad_per_sample.shape = (n_samples, n_classes) grad[:, :n_features] = grad_per_sample.T @ X + l2_reg_strength * weights if self.fit_intercept: @@ -252,13 +252,13 @@ def gradient( ) if not self.base_loss.is_multiclass: - grad = np.empty_like(coef, dtype=X.dtype) + grad = np.empty_like(coef, dtype=weights.dtype) grad[:n_features] = X.T @ grad_per_sample + l2_reg_strength * weights if self.fit_intercept: grad[-1] = grad_per_sample.sum() return grad else: - grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F") + grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") # gradient.shape = (n_samples, n_classes) grad[:, :n_features] = grad_per_sample.T @ X + l2_reg_strength * weights if self.fit_intercept: @@ -311,7 +311,7 @@ def gradient_hessian_product( sample_weight=sample_weight, n_threads=n_threads, ) - grad = np.empty_like(coef, dtype=X.dtype) + grad = np.empty_like(coef, dtype=weights.dtype) grad[:n_features] = X.T @ gradient + l2_reg_strength * weights if self.fit_intercept: grad[-1] = gradient.sum() @@ -358,7 +358,7 @@ def hessp(s): sample_weight=sample_weight, n_threads=n_threads, ) - grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F") + grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") grad[:, :n_features] = gradient.T @ X + l2_reg_strength * weights if self.fit_intercept: grad[:, -1] = gradient.sum(axis=0) @@ -398,7 +398,7 @@ def hessp(s): tmp *= sample_weight[:, np.newaxis] # hess_prod = empty_like(grad), but we ravel grad below and this # function is run after that. 
- hess_prod = np.empty((n_classes, n_dof), dtype=X.dtype, order="F") + hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") hess_prod[:, :n_features] = tmp.T @ X + l2_reg_strength * s if self.fit_intercept: hess_prod[:, -1] = tmp.sum(axis=0) From 02dd2de041cbb961ff4cc614077f55700f01eba8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 21:10:58 +0100 Subject: [PATCH 14/43] TST TweedieRegressor for validate in init --- sklearn/tests/test_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 350e1e95d9882..0f13cd1431a38 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -418,7 +418,6 @@ def test_transformers_get_feature_names_out(transformer): "ColumnTransformer", "SGDOneClassSVM", "TheilSenRegressor", - "TweedieRegressor", ] VALIDATE_ESTIMATOR_INIT = set(VALIDATE_ESTIMATOR_INIT) From 5a455b0baefc1b71815927f08b10490a7a61755c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 21:13:23 +0100 Subject: [PATCH 15/43] ENH use _openmp_effective_n_threads --- sklearn/linear_model/_glm/glm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index c9cf2b70e07a8..72b4749293242 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -23,6 +23,7 @@ from ...utils.optimize import _check_optimize_result from ...utils import check_scalar, check_array from ...utils.validation import check_is_fitted, _check_sample_weight +from ...utils._openmp_helpers import _openmp_effective_n_threads from .._linear_loss import LinearModelLoss @@ -300,9 +301,7 @@ def fit(self, X, y, sample_weight=None): if solver == "lbfgs": func = self._linear_loss.loss_gradient l2_reg_strength = self.alpha - # TODO: In the future, we would like - # n_threads = _openmp_effective_n_threads() - n_threads = 1 + n_threads = _openmp_effective_n_threads() opt_res = scipy.optimize.minimize( func, From dda943f777ce69ae0bf7fba846843dada5c026f5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 21:19:13 +0100 Subject: [PATCH 16/43] DOC add whatsnew --- doc/whats_new/v1.1.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 89392c471d804..2a8f51c65abc1 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -469,6 +469,12 @@ Changelog :pr:`21808`, :pr:`20567` and :pr:`21814` by :user:`Christian Lorentzen `. +- |Enhancement| :class:`~linear_model.GammaRegressor`, + :class:`~linear_model.PoissonRegressor` and :class:`~linear_model.TweedieRegressor` + are faster for ``solvers="lbfgs"``. + :pr:`22548`, :pr:`21808` and :pr:`20567` by + :user:`Christian Lorentzen `. + - |Enhancement| Rename parameter `base_estimator` to `estimator` in :class:`linear_model.RANSACRegressor` to improve readability and consistency. `base_estimator` is deprecated and will be removed in 1.3. @@ -502,6 +508,10 @@ Changelog sub-problem while now all of them are recorded. :pr:`21998` by :user:`Olivier Grisel `. +- |Fix| The input paramter `family` of :class:`linear_model.TweedieRegressor` is not + validated in `__init__` anymore. This (private) parameter was removed. + :pr:`22548` by :user:`Christian Lorentzen `. + :mod:`sklearn.metrics` ...................... 
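As a quick orientation for the changelog entries above, here is a minimal usage sketch of the affected estimators; the data and hyperparameter values are made up for illustration and are not part of the patch series.

import numpy as np
from sklearn.linear_model import PoissonRegressor, TweedieRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(200, 3))
y = rng.poisson(lam=np.exp(X @ np.array([0.3, -0.2, 0.1])))

# Both estimators are fitted with the lbfgs solver; score() returns D^2,
# the fraction of deviance explained.
poisson = PoissonRegressor(alpha=1e-4, max_iter=300).fit(X, y)
tweedie = TweedieRegressor(power=1.5, link="log", alpha=1e-4).fit(X, y)
print(poisson.score(X, y), tweedie.score(X, y))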
From 95594643eb25b68274e307a5aee4988f5af3d4d3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 22:04:15 +0100 Subject: [PATCH 17/43] FIX dtype in score funtion --- sklearn/linear_model/_glm/glm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 72b4749293242..f2fe270121c8b 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -409,14 +409,12 @@ def score(self, X, y, sample_weight=None): # input validation and so on) raw_prediction = self._linear_predictor(X) # validates X # required by losses - y = check_array(y, dtype=[np.float64, np.float32], order="C", ensure_2d=False) + y = check_array(y, dtype=raw_prediction.dtype, order="C", ensure_2d=False) if sample_weight is not None: # Note that _check_sample_weight calls check_array(order="C") required by # losses. - sample_weight = _check_sample_weight( - sample_weight, X, dtype=[np.float64, np.float32] - ) + sample_weight = _check_sample_weight(sample_weight, X, dtype=y.dtype) base_loss = self._linear_loss.base_loss From 85f616fc5d4de5119a0f5e2a4715ca6b0e5f4459 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 22:19:39 +0100 Subject: [PATCH 18/43] FIX validation of base_loss_class --- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/tests/test_glm.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f2fe270121c8b..1685d20847594 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -183,7 +183,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The base_loss_class must be an subclass of" " sklearn._loss.loss.BaseLoss; ; got" - f" (base_loss_class={self.base_loss})" + f" (base_loss_class={self.base_loss_class})" ) check_scalar( diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index db1bf483085f0..821be7a97dfbf 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -77,6 +77,15 @@ def test_glm_solver_argument(solver): glm.fit(X, y) +def test_glm_validate_base_loss_class(): + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(base_loss_class=LogLink) + msg = "The base_loss_class must be an subclass of sklearn._loss.loss.BaseLoss" + with pytest.raises(ValueError, match=msg): + glm.fit(X, y) + + @pytest.mark.parametrize( "Estimator", [GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor, TweedieRegressor], From bc19cf3ef2707dc85f36875665af17056bdf19bb Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 22:21:03 +0100 Subject: [PATCH 19/43] TST validation of y_true range --- sklearn/linear_model/_glm/tests/test_glm.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 821be7a97dfbf..853fa935a9ba0 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -161,6 +161,25 @@ def test_glm_warm_start_argument(warm_start): glm.fit(X, y) +@pytest.mark.parametrize( + "glm", + [ + GeneralizedLinearRegressor( + base_loss_class=HalfTweedieLoss, base_loss_params={"power": 3} + ), + PoissonRegressor(), + GammaRegressor(), + TweedieRegressor(power=1.5), + ], +) +def test_glm_wrong_y_range(glm): 
+ y = np.array([-1, 2]) + X = np.array([[1], [1]]) + msg = r"Some value\(s\) of y are out of the valid range of the loss" + with pytest.raises(ValueError, match=msg): + glm.fit(X, y) + + @pytest.mark.parametrize("fit_intercept", [False, True]) def test_glm_identity_regression(fit_intercept): """Test GLM regression with identity link on a simple dataset.""" From 9582e8b1647807532fe47e3a85dd6d1d982452d8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 19 Feb 2022 22:53:18 +0100 Subject: [PATCH 20/43] DOC fix typo --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 1685d20847594..e095208d94e0a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -44,7 +44,7 @@ class GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and predicting the mean of the target y as y_pred=h(X*w) with coefficients w. - Therefore, the fit minimizes the following objective function with L2priors as + Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: 1/(2*sum(s_i)) * sum(s_i * deviance(y_i, h(x_i*w)) + 1/2 * alpha * ||w||_2^2 From 3b6beda8b490f6547619a009d576d60315e4050f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 25 Feb 2022 15:05:19 +0100 Subject: [PATCH 21/43] MNT make GeneralizedLinearRegressor private --- sklearn/linear_model/_glm/__init__.py | 4 +-- sklearn/linear_model/_glm/glm.py | 15 ++++----- sklearn/linear_model/_glm/tests/test_glm.py | 36 ++++++++++----------- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index e5d944fc225a4..fea9c4d4cf6ba 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,14 +1,14 @@ # License: BSD 3 clause from .glm import ( - GeneralizedLinearRegressor, + _GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor, TweedieRegressor, ) __all__ = [ - "GeneralizedLinearRegressor", + "_GeneralizedLinearRegressor", "PoissonRegressor", "GammaRegressor", "TweedieRegressor", diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index e095208d94e0a..0166cef2c46f6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -39,7 +39,7 @@ def __init__(self, sample_weight=None): # TODO: We could allow strings for base_loss_class (as before for the now removed # family parameter): 'normal', 'poisson', 'gamma', 'inverse-gaussian' -class GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): +class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): """Regression via a penalized Generalized Linear Model (GLM). 
GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and @@ -201,8 +201,8 @@ def fit(self, X, y, sample_weight=None): ) if self.solver not in ["lbfgs"]: raise ValueError( - "GeneralizedLinearRegressor supports only solvers" - "'lbfgs'; got {0}".format(self.solver) + f"{self.__class__.__name__} supports only solvers 'lbfgs'; " + f"got {self.solver}" ) solver = self.solver check_scalar( @@ -458,7 +458,7 @@ def _get_base_loss_instance(self): return self.base_loss_class(**self.base_loss_params) -class PoissonRegressor(GeneralizedLinearRegressor): +class PoissonRegressor(_GeneralizedLinearRegressor): """Generalized Linear Model with a Poisson distribution. This regressor uses the 'log' link function. @@ -524,8 +524,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): See Also -------- - GeneralizedLinearRegressor : Generalized Linear Model with a Poisson - distribution. + TweedieRegressor : Generalized Linear Model with a Tweedie distribution. Examples -------- @@ -573,7 +572,7 @@ def _get_base_loss_instance(self): return HalfPoissonLoss() -class GammaRegressor(GeneralizedLinearRegressor): +class GammaRegressor(_GeneralizedLinearRegressor): """Generalized Linear Model with a Gamma distribution. This regressor uses the 'log' link function. @@ -686,7 +685,7 @@ def _get_base_loss_instance(self): return HalfGammaLoss() -class TweedieRegressor(GeneralizedLinearRegressor): +class TweedieRegressor(_GeneralizedLinearRegressor): """Generalized Linear Model with a Tweedie distribution. This estimator can be used to model different GLMs depending on the diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 853fa935a9ba0..c727f21f165e0 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -17,7 +17,7 @@ HalfTweedieLossIdentity, ) from sklearn.datasets import make_regression -from sklearn.linear_model._glm import GeneralizedLinearRegressor +from sklearn.linear_model._glm import _GeneralizedLinearRegressor from sklearn.linear_model._glm.glm import HalfInverseGaussianLoss from sklearn.linear_model import TweedieRegressor, PoissonRegressor, GammaRegressor from sklearn.linear_model import Ridge @@ -40,7 +40,7 @@ def test_sample_weights_validation(): X = [[1]] y = [1] weights = 0 - glm = GeneralizedLinearRegressor() + glm = _GeneralizedLinearRegressor() # Positive weights are accepted glm.fit(X, y, sample_weight=1) @@ -62,7 +62,7 @@ def test_glm_fit_intercept_argument(fit_intercept): """Test GLM for invalid fit_intercept argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + glm = _GeneralizedLinearRegressor(fit_intercept=fit_intercept) with pytest.raises(ValueError, match="fit_intercept must be bool"): glm.fit(X, y) @@ -72,7 +72,7 @@ def test_glm_solver_argument(solver): """Test GLM for invalid solver argument.""" y = np.array([1, 2]) X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(solver=solver) + glm = _GeneralizedLinearRegressor(solver=solver) with pytest.raises(ValueError): glm.fit(X, y) @@ -80,7 +80,7 @@ def test_glm_solver_argument(solver): def test_glm_validate_base_loss_class(): y = np.array([1, 2]) X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(base_loss_class=LogLink) + glm = _GeneralizedLinearRegressor(base_loss_class=LogLink) msg = "The base_loss_class must be an subclass of sklearn._loss.loss.BaseLoss" with pytest.raises(ValueError, match=msg): glm.fit(X, y) @@ -88,7 
+88,7 @@ def test_glm_validate_base_loss_class(): @pytest.mark.parametrize( "Estimator", - [GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor, TweedieRegressor], + [_GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor, TweedieRegressor], ) @pytest.mark.parametrize( "params, err_type, err_msg", @@ -156,7 +156,7 @@ def test_glm_warm_start_argument(warm_start): """Test GLM for invalid warm_start argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(warm_start=warm_start) + glm = _GeneralizedLinearRegressor(warm_start=warm_start) with pytest.raises(ValueError, match="warm_start must be bool"): glm.fit(X, y) @@ -164,7 +164,7 @@ def test_glm_warm_start_argument(warm_start): @pytest.mark.parametrize( "glm", [ - GeneralizedLinearRegressor( + _GeneralizedLinearRegressor( base_loss_class=HalfTweedieLoss, base_loss_params={"power": 3} ), PoissonRegressor(), @@ -186,7 +186,7 @@ def test_glm_identity_regression(fit_intercept): coef = [1.0, 2.0] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor( + glm = _GeneralizedLinearRegressor( alpha=0, base_loss_class=HalfSquaredError, fit_intercept=fit_intercept, @@ -217,7 +217,7 @@ def test_glm_sample_weight_consistentcy(fit_intercept, alpha, base_loss_class): alpha=alpha, base_loss_class=base_loss_class, fit_intercept=fit_intercept ) - glm = GeneralizedLinearRegressor(**glm_params).fit(X, y) + glm = _GeneralizedLinearRegressor(**glm_params).fit(X, y) coef = glm.coef_.copy() # sample_weight=np.ones(..) should be equivalent to sample_weight=None @@ -246,11 +246,11 @@ def test_glm_sample_weight_consistentcy(fit_intercept, alpha, base_loss_class): sample_weight_1 = np.ones(len(y)) sample_weight_1[: n_samples // 2] = 2 - glm1 = GeneralizedLinearRegressor(**glm_params).fit( + glm1 = _GeneralizedLinearRegressor(**glm_params).fit( X, y, sample_weight=sample_weight_1 ) - glm2 = GeneralizedLinearRegressor(**glm_params).fit(X2, y2, sample_weight=None) + glm2 = _GeneralizedLinearRegressor(**glm_params).fit(X2, y2, sample_weight=None) assert_allclose(glm1.coef_, glm2.coef_) @@ -271,7 +271,7 @@ def test_glm_log_regression(fit_intercept, base_loss_class, base_loss_param): coef = [0.2, -0.1] X = np.array([[0, 1, 2, 3, 4], [1, 1, 1, 1, 1]]).T y = np.exp(np.dot(X, coef)) - glm = GeneralizedLinearRegressor( + glm = _GeneralizedLinearRegressor( alpha=0, base_loss_class=base_loss_class, base_loss_params=base_loss_param, @@ -298,12 +298,12 @@ def test_warm_start(fit_intercept): random_state=42, ) - glm1 = GeneralizedLinearRegressor( + glm1 = _GeneralizedLinearRegressor( warm_start=False, fit_intercept=fit_intercept, max_iter=1000 ) glm1.fit(X, y) - glm2 = GeneralizedLinearRegressor( + glm2 = _GeneralizedLinearRegressor( warm_start=True, fit_intercept=fit_intercept, max_iter=1 ) # As we intentionally set max_iter=1, L-BFGS-B will issue a @@ -369,7 +369,7 @@ def test_normal_ridge_comparison( ) ridge.fit(X_train, y_train, sample_weight=sw_train) - glm = GeneralizedLinearRegressor( + glm = _GeneralizedLinearRegressor( alpha=alpha, base_loss_class=HalfSquaredError, fit_intercept=fit_intercept, @@ -399,7 +399,7 @@ def test_poisson_glmnet(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - glm = GeneralizedLinearRegressor( + glm = _GeneralizedLinearRegressor( alpha=1, fit_intercept=True, base_loss_class=HalfPoissonLoss, @@ -414,7 +414,7 @@ def test_poisson_glmnet(): def test_convergence_warning(regression_data): X, y = 
regression_data - est = GeneralizedLinearRegressor(max_iter=1, tol=1e-20) + est = _GeneralizedLinearRegressor(max_iter=1, tol=1e-20) with pytest.warns(ConvergenceWarning): est.fit(X, y) From 0ebe6f2db56ff1773f529388aaa4377930a7a895 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 25 Feb 2022 15:10:24 +0100 Subject: [PATCH 22/43] DOC typo and text improvements --- doc/whats_new/v1.1.rst | 2 +- sklearn/linear_model/_glm/glm.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 2a8f51c65abc1..a13dac9979877 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -508,7 +508,7 @@ Changelog sub-problem while now all of them are recorded. :pr:`21998` by :user:`Olivier Grisel `. -- |Fix| The input paramter `family` of :class:`linear_model.TweedieRegressor` is not +- |Fix| The input parameter `family` of :class:`linear_model.TweedieRegressor` is not validated in `__init__` anymore. This (private) parameter was removed. :pr:`22548` by :user:`Christian Lorentzen `. diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0166cef2c46f6..011d7bfa6327c 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -731,10 +731,10 @@ class TweedieRegressor(_GeneralizedLinearRegressor): link : {'auto', 'identity', 'log'}, default='auto' The link function of the GLM, i.e. mapping from linear predictor `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets - the link depending on the chosen base_loss_class (EDM family) as follows: + the link depending on the chosen `power` parameter as follows: - 'identity' for ``power <= 0``, e.g. for the Normal distribution - - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian + - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian distributions max_iter : int, default=100 From e314ad91957388fa0d848729610e11e0a800484e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 25 Feb 2022 15:29:36 +0100 Subject: [PATCH 23/43] MNT make base_loss_class private --- sklearn/linear_model/_glm/glm.py | 50 ++++++++++----------- sklearn/linear_model/_glm/tests/test_glm.py | 34 +++++++------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 011d7bfa6327c..6bf3eac955a75 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -37,7 +37,7 @@ def __init__(self, sample_weight=None): super().__init__(sample_weight=sample_weight, power=3) -# TODO: We could allow strings for base_loss_class (as before for the now removed +# TODO: We could allow strings for _base_loss_class (as before for the now removed # family parameter): 'normal', 'poisson', 'gamma', 'inverse-gaussian' class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): """Regression via a penalized Generalized Linear Model (GLM). @@ -76,13 +76,13 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - base_loss_class : subclass of BaseLoss, default=HalfSquaredError - A `base_loss_class` contains a specific loss function as well as the link + _base_loss_class : subclass of BaseLoss, default=HalfSquaredError + A `_base_loss_class` contains a specific loss function as well as the link function. The loss to be minimized specifies the distributional assumption of the GLM, i.e. 
the distribution from the EDM. Here are some examples: ======================= ======== ========================== - base_loss_class Link Target Domain + _base_loss_class Link Target Domain ======================= ======== ========================== HalfSquaredError identity y any real number HalfPoissonLoss log 0 <= y @@ -96,8 +96,8 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): we have `y_pred = exp(X @ coeff + intercept)`. base_loss_params : dictionary, default={} - Arguments to be passed to base_loss_class, e.g. {"power": 1.5} with - `base_loss_class=HalfTweedieLoss`. + Arguments to be passed to _base_loss_class, e.g. {"power": 1.5} with + `_base_loss_class=HalfTweedieLoss`. solver : 'lbfgs', default='lbfgs' Algorithm to use in the optimization problem: @@ -142,7 +142,7 @@ def __init__( *, alpha=1.0, fit_intercept=True, - base_loss_class=HalfSquaredError, + _base_loss_class=HalfSquaredError, base_loss_params={}, solver="lbfgs", max_iter=100, @@ -152,7 +152,7 @@ def __init__( ): self.alpha = alpha self.fit_intercept = fit_intercept - self.base_loss_class = base_loss_class + self._base_loss_class = _base_loss_class self.base_loss_params = base_loss_params self.solver = solver self.max_iter = max_iter @@ -179,11 +179,11 @@ def fit(self, X, y, sample_weight=None): self : object Fitted model. """ - if not issubclass(self.base_loss_class, BaseLoss): + if not issubclass(self._base_loss_class, BaseLoss): raise ValueError( - "The base_loss_class must be an subclass of" + "The _base_loss_class must be an subclass of" " sklearn._loss.loss.BaseLoss; ; got" - f" (base_loss_class={self.base_loss_class})" + f" (_base_loss_class={self._base_loss_class})" ) check_scalar( @@ -263,7 +263,7 @@ def fit(self, X, y, sample_weight=None): if not self._linear_loss.base_loss.in_y_true_range(y): raise ValueError( "Some value(s) of y are out of the valid range of the loss" - f" {self.base_loss_class.__name__}." + f" {self._base_loss_class.__name__}." ) # TODO: if alpha=0 check that X is not rank deficient @@ -378,7 +378,7 @@ def score(self, X, y, sample_weight=None): D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal - for ``base_loss_class=HalfSquaredError``. + for ``_base_loss_class=HalfSquaredError``. D^2 is defined as :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, @@ -421,7 +421,7 @@ def score(self, X, y, sample_weight=None): if not base_loss.in_y_true_range(y): raise ValueError( "Some value(s) of y are out of the valid range of the loss" - f" {self.base_loss_class.__name__}." + f" {self._base_loss_class.__name__}." ) # Note that constant_to_optimal_zero is already multiplied by sample_weight. @@ -447,7 +447,7 @@ def score(self, X, y, sample_weight=None): def _more_tags(self): # create instance of BaseLoss if fit wasn't called yet. - base_loss = self.base_loss_class(**self.base_loss_params) + base_loss = self._base_loss_class(**self.base_loss_params) return {"requires_positive_y": not base_loss.in_y_true_range(-1.0)} def _get_base_loss_instance(self): @@ -455,7 +455,7 @@ def _get_base_loss_instance(self): # TweedieRegressor. # Note that we do not need to pass sample_weight to the loss class as this is # only needed to set loss.constant_hessian on which GLMs do not rely. 
- return self.base_loss_class(**self.base_loss_params) + return self._base_loss_class(**self.base_loss_params) class PoissonRegressor(_GeneralizedLinearRegressor): @@ -557,7 +557,7 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, - base_loss_class=HalfPoissonLoss, + _base_loss_class=HalfPoissonLoss, max_iter=max_iter, tol=tol, warm_start=warm_start, @@ -565,9 +565,9 @@ def __init__( ) def _get_base_loss_instance(self): - if self.base_loss_class != HalfPoissonLoss: + if self._base_loss_class != HalfPoissonLoss: raise ValueError( - "PoissonRegressor.base_loss_class must be HalfPoissonLoss!" + "PoissonRegressor._base_loss_class must be HalfPoissonLoss!" ) return HalfPoissonLoss() @@ -672,7 +672,7 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, - base_loss_class=HalfGammaLoss, + _base_loss_class=HalfGammaLoss, max_iter=max_iter, tol=tol, warm_start=warm_start, @@ -680,8 +680,8 @@ def __init__( ) def _get_base_loss_instance(self): - if self.base_loss_class != HalfGammaLoss: - raise ValueError("GammaRegressor.base_loss_class must be HalfGammaLoss!") + if self._base_loss_class != HalfGammaLoss: + raise ValueError("GammaRegressor._base_loss_class must be HalfGammaLoss!") return HalfGammaLoss() @@ -817,7 +817,7 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, - base_loss_class=HalfTweedieLoss, + _base_loss_class=HalfTweedieLoss, base_loss_params={"power": power}, max_iter=max_iter, tol=tol, @@ -830,9 +830,9 @@ def __init__( def _get_base_loss_instance(self): # This is only necessary because of the link and power arguments of the # TweedieRegressor. - if self.base_loss_class not in [HalfTweedieLoss, HalfTweedieLossIdentity]: + if self._base_loss_class not in [HalfTweedieLoss, HalfTweedieLossIdentity]: raise ValueError( - "TweedieRegressor.base_loss_class must be HalfTweedieLoss or" + "TweedieRegressor._base_loss_class must be HalfTweedieLoss or" " HalfTweedieLossIdentity!" 
) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c727f21f165e0..8ca3a77cfcd5b 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -80,8 +80,8 @@ def test_glm_solver_argument(solver): def test_glm_validate_base_loss_class(): y = np.array([1, 2]) X = np.array([[1], [2]]) - glm = _GeneralizedLinearRegressor(base_loss_class=LogLink) - msg = "The base_loss_class must be an subclass of sklearn._loss.loss.BaseLoss" + glm = _GeneralizedLinearRegressor(_base_loss_class=LogLink) + msg = "The _base_loss_class must be an subclass of sklearn._loss.loss.BaseLoss" with pytest.raises(ValueError, match=msg): glm.fit(X, y) @@ -165,7 +165,7 @@ def test_glm_warm_start_argument(warm_start): "glm", [ _GeneralizedLinearRegressor( - base_loss_class=HalfTweedieLoss, base_loss_params={"power": 3} + _base_loss_class=HalfTweedieLoss, base_loss_params={"power": 3} ), PoissonRegressor(), GammaRegressor(), @@ -188,7 +188,7 @@ def test_glm_identity_regression(fit_intercept): y = np.dot(X, coef) glm = _GeneralizedLinearRegressor( alpha=0, - base_loss_class=HalfSquaredError, + _base_loss_class=HalfSquaredError, fit_intercept=fit_intercept, tol=1e-12, ) @@ -214,7 +214,7 @@ def test_glm_sample_weight_consistentcy(fit_intercept, alpha, base_loss_class): X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) glm_params = dict( - alpha=alpha, base_loss_class=base_loss_class, fit_intercept=fit_intercept + alpha=alpha, _base_loss_class=base_loss_class, fit_intercept=fit_intercept ) glm = _GeneralizedLinearRegressor(**glm_params).fit(X, y) @@ -273,7 +273,7 @@ def test_glm_log_regression(fit_intercept, base_loss_class, base_loss_param): y = np.exp(np.dot(X, coef)) glm = _GeneralizedLinearRegressor( alpha=0, - base_loss_class=base_loss_class, + _base_loss_class=base_loss_class, base_loss_params=base_loss_param, fit_intercept=fit_intercept, tol=1e-8, @@ -371,7 +371,7 @@ def test_normal_ridge_comparison( glm = _GeneralizedLinearRegressor( alpha=alpha, - base_loss_class=HalfSquaredError, + _base_loss_class=HalfSquaredError, fit_intercept=fit_intercept, max_iter=300, tol=1e-5, @@ -402,7 +402,7 @@ def test_poisson_glmnet(): glm = _GeneralizedLinearRegressor( alpha=1, fit_intercept=True, - base_loss_class=HalfPoissonLoss, + _base_loss_class=HalfPoissonLoss, tol=1e-7, max_iter=300, ) @@ -424,18 +424,18 @@ def test_convergence_warning(regression_data): [(PoissonRegressor, HalfPoissonLoss), (GammaRegressor, HalfGammaLoss)], ) def test_raise_on_reset_base_loss_class(regression_data, estimator, base_loss): - # Make sure to raise an appropriate error if base_loss_class was reset. + # Make sure to raise an appropriate error if _base_loss_class was reset. X, y = regression_data est = estimator() - est.base_loss_class = HalfSquaredError - msg = f"{estimator.__name__}.base_loss_class must be {base_loss.__name__}!" + est._base_loss_class = HalfSquaredError + msg = f"{estimator.__name__}._base_loss_class must be {base_loss.__name__}!" with pytest.raises(ValueError, match=msg): est.fit(X, y) def test_tweedie_raise_on_reset_base_loss_class(regression_data): - # Make sure to raise an appropriate error if base_loss_class or base_loss_params + # Make sure to raise an appropriate error if _base_loss_class or base_loss_params # was inconsistently reset for a Tweedie deviance loss. 
X, y = regression_data # make y positive @@ -443,7 +443,7 @@ def test_tweedie_raise_on_reset_base_loss_class(regression_data): power = 2.0 est = TweedieRegressor(power=power) - assert est.base_loss_class, HalfTweedieLoss + assert est._base_loss_class, HalfTweedieLoss assert est.base_loss_params == {"power": power} assert est.power == power @@ -461,13 +461,13 @@ def test_tweedie_raise_on_reset_base_loss_class(regression_data): with pytest.raises(ValueError, match=msg): est.fit(X, y) - # reset base_loss_class + # reset _base_loss_class est = TweedieRegressor(power=power) - est.base_loss_class = HalfTweedieLossIdentity + est._base_loss_class = HalfTweedieLossIdentity est.fit(X, y) - est.base_loss_class = HalfSquaredError + est._base_loss_class = HalfSquaredError msg = ( - "TweedieRegressor.base_loss_class must be HalfTweedieLoss or" + "TweedieRegressor._base_loss_class must be HalfTweedieLoss or" " HalfTweedieLossIdentity!" ) with pytest.raises(ValueError, match=msg): From 6b25501032feda3a3c06d71bc6089bdfda3ce618 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 25 Feb 2022 18:19:36 +0100 Subject: [PATCH 24/43] CLN address review comments --- sklearn/_loss/_loss.pyx.tp | 4 +--- sklearn/_loss/loss.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 208a6f257d421..083d29b9090da 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -559,7 +559,6 @@ cdef inline double cgradient_half_tweedie_identity( double raw_prediction, double power ) nogil: - cdef double tmp if power == 0.: return raw_prediction - y_true elif power == 1.: @@ -567,8 +566,7 @@ cdef inline double cgradient_half_tweedie_identity( elif power == 2.: return (raw_prediction - y_true) / (raw_prediction * raw_prediction) else: - tmp = pow(raw_prediction, -power) - return tmp * (raw_prediction - y_true) + return pow(raw_prediction, -power) * (raw_prediction - y_true) cdef inline double_pair closs_grad_half_tweedie_identity( diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index b3f358248ffe4..e89049a1b9ae3 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -778,7 +778,7 @@ class HalfTweedieLossIdentity(BaseLoss): - y_true_i * raw_prediction_i**(1-p) / (1-p) + raw_prediction_i**(2-p) / (2-p) - Note that the minimum loss is 0. + Note that the minimum value of this loss is 0. 
Note furthermore that although no Tweedie distribution exists for 0 < power < 1, it still gives a strictly consistent scoring function for From 324b85811a541358e6e4600e4021eea3a31fcb37 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Mar 2022 10:15:05 +0100 Subject: [PATCH 25/43] More valid cases for HalfTweedieLossIdentity(power=0) --- sklearn/_loss/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 7844f7a5e4794..d3551f1beccf2 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -161,7 +161,7 @@ def test_loss_boundary(loss): (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), (HalfTweedieLossIdentity(power=-3), [0.1, 100], [-np.inf, np.inf]), - (HalfTweedieLossIdentity(power=0), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLossIdentity(power=0), [-3, -0.1, 0, 0.1, 100], [-np.inf, np.inf]), (HalfTweedieLossIdentity(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), (HalfTweedieLossIdentity(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), (HalfTweedieLossIdentity(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), From d5f472322dc644f98fc4e3597b690e9ceebb20b3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Mar 2022 14:56:38 +0100 Subject: [PATCH 26/43] Add test_tweedie_log_identity_consistency --- sklearn/_loss/tests/test_loss.py | 47 ++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index d3551f1beccf2..38908862d248d 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -1126,3 +1126,50 @@ def test_loss_pickle(loss): assert loss(y_true=y_true, raw_prediction=raw_prediction) == approx( unpickled_loss(y_true=y_true, raw_prediction=raw_prediction) ) + + +@pytest.mark.parametrize("p", [0, 1, 1.5, 2, 3]) +def test_tweedie_log_identity_consistency(p): + half_tweedie_log = HalfTweedieLoss(power=p) + half_tweedie_identity = HalfTweedieLossIdentity(power=p) + n_samples = 10 + # We intentionally pick an arbitrary value for y_true and keep it fixed + # because the dropped constant term in the HalfTweedieLoss class depends on + # y_true. + y_true = np.full(shape=n_samples, fill_value=0.1) + + # XXX: increasing the upper bound makes the test fail very easily, first + # with significantly different gradients, and then with different loss + # values. + raw_prediction = np.logspace(-10, -3, n_samples) + + # Let's compare the loss values, up to some constant term that is dropped + # in HalfTweedieLoss but not in HalfTweedieLossIdentity. + loss_log_without_constant_terms = half_tweedie_log.loss( + y_true=y_true, raw_prediction=raw_prediction + ) + loss_identity_with_constant_terms = half_tweedie_identity.loss( + y_true=y_true, raw_prediction=np.exp(raw_prediction) + ) + # Check that the difference between the two ways of computing the loss + # values are just the constant terms that are computed by + # HalfTweedieLossIdentity but ignored by HalfTweedieLoss (with built-in log + # link) + diff = loss_log_without_constant_terms - loss_identity_with_constant_terms + assert_allclose(diff, diff[0]) + + # The gradient and hessian should be the same for the same because they + # do not depend on the constant terms. 
+ + gradient_log, hessian_log = half_tweedie_log.gradient_hessian( + y_true=y_true, raw_prediction=raw_prediction + ) + gradient_identity, hessian_identity = half_tweedie_identity.gradient_hessian( + y_true=y_true, raw_prediction=np.exp(raw_prediction) + ) + # XXX: is the large atol necessary to make this assertion pass hiding a bug + # in gradient computation? + assert_allclose(gradient_log, gradient_identity, atol=1e-3) + + # XXX: the following always fails + # assert_allclose(hessian_log, hessian_identity, atol=1e-3) From 80bf4702223f624ffcd66625ed8f76cb966a8c08 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Mar 2022 20:45:27 +0100 Subject: [PATCH 27/43] Did not mean to use a logspace for raw_prediction --- sklearn/_loss/tests/test_loss.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 38908862d248d..b5c33727f89ca 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -1137,11 +1137,7 @@ def test_tweedie_log_identity_consistency(p): # because the dropped constant term in the HalfTweedieLoss class depends on # y_true. y_true = np.full(shape=n_samples, fill_value=0.1) - - # XXX: increasing the upper bound makes the test fail very easily, first - # with significantly different gradients, and then with different loss - # values. - raw_prediction = np.logspace(-10, -3, n_samples) + raw_prediction = np.linspace(-5.0, 5.0, n_samples) # Let's compare the loss values, up to some constant term that is dropped # in HalfTweedieLoss but not in HalfTweedieLossIdentity. @@ -1158,18 +1154,14 @@ def test_tweedie_log_identity_consistency(p): diff = loss_log_without_constant_terms - loss_identity_with_constant_terms assert_allclose(diff, diff[0]) - # The gradient and hessian should be the same for the same because they - # do not depend on the constant terms. - + # The gradient and hessian should be the same because they do not depend on + # the constant terms. gradient_log, hessian_log = half_tweedie_log.gradient_hessian( y_true=y_true, raw_prediction=raw_prediction ) gradient_identity, hessian_identity = half_tweedie_identity.gradient_hessian( y_true=y_true, raw_prediction=np.exp(raw_prediction) ) - # XXX: is the large atol necessary to make this assertion pass hiding a bug - # in gradient computation? - assert_allclose(gradient_log, gradient_identity, atol=1e-3) - - # XXX: the following always fails - # assert_allclose(hessian_log, hessian_identity, atol=1e-3) + # XXX: the following always fails even for large values of atol + # assert_allclose(gradient_log, gradient_identity) + # assert_allclose(hessian_log, hessian_identity) From 5c9b2647bf299a50cd58e24e505ba8168be07bef Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 10 Mar 2022 13:08:02 +0100 Subject: [PATCH 28/43] CLN address review comments --- doc/whats_new/v1.1.rst | 3 ++- sklearn/_loss/_loss.pyx.tp | 5 +---- sklearn/_loss/loss.py | 3 ++- sklearn/linear_model/_glm/glm.py | 5 +++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 6d6ab0155df2e..87c939a59897b 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -493,7 +493,7 @@ Changelog :class:`kernel_approximation.RBFSampler`, and :class:`kernel_approximation.SkewedChi2Sampler`. :pr:`22694` by `Thomas Fan`_. 
-- |API| Adds :term:`get_feature_names_out` to the following transformers +- |API| Adds :term:`get_feature_names_out` to the following transformers of the :mod:`~sklearn.kernel_approximation` module: :class:`~sklearn.kernel_approximation.AdditiveChi2Sampler`. :pr:`22137` by `Thomas Fan`_. @@ -568,6 +568,7 @@ Changelog - |Fix| The input parameter `family` of :class:`linear_model.TweedieRegressor` is not validated in `__init__` anymore. This (private) parameter was removed. :pr:`22548` by :user:`Christian Lorentzen `. + - |Enhancement| :class:`linear_model.BayesianRidge` and :class:`linear_model.ARDRegression` now preserve float32 dtype. :pr:`9087` by :user:`Arthur Imbert ` and :pr:`22525` by :user:`Meekail Zain `. diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 083d29b9090da..af637d34eba16 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -1,7 +1,7 @@ {{py: """ -Template file for easily generate loops over samples using Tempita +Template file to easily generate loops over samples using Tempita (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). Generated file: _loss.pyx @@ -133,9 +133,6 @@ doc_HalfTweedieLossIdentity = ( max(y_true, 0)**(2-p) / (1-p) / (2-p) - y_true * y_pred**(1-p) / (1-p) + y_pred**(2-p) / (2-p) - = max(y_true, 0)**(2-p) / (1-p) / (2-p) - - y_true * exp((1-p) * raw_prediction) / (1-p) - + exp((2-p) * raw_prediction) / (2-p) Notes: - Here, we do not drop constant terms in contrast to the version with log-link. diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 8747dde6af7bf..5eb58bb0c27f9 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -778,7 +778,8 @@ class HalfTweedieLossIdentity(BaseLoss): y_true in real numbers for power <= 0 y_true in non-negative real numbers for 0 < power < 2 y_true in positive real numbers for 2 <= power - y_pred in positive real numbers, all reals for power = 0 + y_pred in positive real numbers for power != 0 + y_pred in real numbers for power = 0 power in real numbers Link: diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 6bf3eac955a75..7e25986a0a1db 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -50,7 +50,7 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): 1/(2*sum(s_i)) * sum(s_i * deviance(y_i, h(x_i*w)) + 1/2 * alpha * ||w||_2^2 with inverse link function h, s=sample_weight and per observation (unit) deviance - deviance(y_i, h(x_i*w). Note that for an EDM 1/2 * deviance is the negative + deviance(y_i, h(x_i*w)). Note that for an EDM 1/2 * deviance is the negative log-likelihood up to a constant (in w) term. The parameter ``alpha`` corresponds to the lambda parameter in glmnet. @@ -89,6 +89,7 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): HalfGammaLoss log 0 < y HalfInverseGaussianLoss log 0 < y HalfTweedieLoss log dependend on tweedie power + HalfTweedieLossIdentity identity dependend on tweedie power ======================= ======== ========================== The link function of the GLM, i.e. 
mapping from linear predictor @@ -857,5 +858,5 @@ def _get_base_loss_instance(self): else: raise ValueError( "The link must be an element of ['auto', 'identity', 'log']; " - "got (link={0})".format(self.link) + f"got (link={self.link!r})" ) From db6c1928a22a1d6c82986b03ab9219e5f1f57e20 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 10 Mar 2022 13:29:29 +0100 Subject: [PATCH 29/43] DEP family - PoissonRegressor with usual deprecration - GammaRegressor with usual deprecation - TweedieRegressor breaking deprecation Reason: We want to get rid of the `class ExponentialDispersionModel` entirely --- sklearn/linear_model/_glm/glm.py | 27 ++++++++++++++++++++- sklearn/linear_model/_glm/tests/test_glm.py | 15 ++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 7e25986a0a1db..3d695b980d809 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -21,7 +21,7 @@ ) from ...base import BaseEstimator, RegressorMixin from ...utils.optimize import _check_optimize_result -from ...utils import check_scalar, check_array +from ...utils import check_scalar, check_array, deprecated from ...utils.validation import check_is_fitted, _check_sample_weight from ...utils._openmp_helpers import _openmp_effective_n_threads from .._linear_loss import LinearModelLoss @@ -76,6 +76,15 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). + family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \ + or a BaseLoss instance, default='normal' + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. + + .. deprecated:: 1.1 + `family` is deprecated in 1.1 and will be removed in 1.3. + Use `_base_loss_class` instead. + _base_loss_class : subclass of BaseLoss, default=HalfSquaredError A `_base_loss_class` contains a specific loss function as well as the link function. The loss to be minimized specifies the distributional assumption of @@ -458,6 +467,22 @@ def _get_base_loss_instance(self): # only needed to set loss.constant_hessian on which GLMs do not rely. return self._base_loss_class(**self.base_loss_params) + # FIXME: remove in v1.3 + @deprecated( # type: ignore + "Attribute `family` was deprecated in version 1.1 and " + "will be removed in 1.3. Use `_base_loss_class` instead." + ) + @property + def family(self): + if isinstance(self, PoissonRegressor): + return "poisson" + elif isinstance(self, GammaRegressor): + return "gamma" + elif isinstance(self, TweedieRegressor) and self.power == 3: + return "inverse-gaussian" + else: + return self._base_loss_class.__name__ + class PoissonRegressor(_GeneralizedLinearRegressor): """Generalized Linear Model with a Poisson distribution. 
diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 8ca3a77cfcd5b..fd7f07ac78380 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -533,3 +533,18 @@ def test_tweedie_score(regression_data, power, link): ) def test_tags(estimator, value): assert estimator._get_tags()["requires_positive_y"] is value + + +# FIXME: remove in v1.3 +@pytest.mark.parametrize( + "est, family", + [ + (PoissonRegressor(), "poisson"), + (GammaRegressor(), "gamma"), + (TweedieRegressor(power=3), "inverse-gaussian"), + (TweedieRegressor(), "HalfTweedieLoss"), + ], +) +def test_family_deprecation(est, family): + with pytest.warns(FutureWarning, match="`family` was deprecated"): + assert est.family == family From a0be233f6a5dfb8e7efe71d90ae7dfd87d294f87 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 10 Mar 2022 15:23:16 +0100 Subject: [PATCH 30/43] CLN remove base_loss_params --- sklearn/linear_model/_glm/glm.py | 95 ++++++++++----------- sklearn/linear_model/_glm/tests/test_glm.py | 85 +++++++++--------- 2 files changed, 82 insertions(+), 98 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 3d695b980d809..22e99806a8897 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -37,7 +37,7 @@ def __init__(self, sample_weight=None): super().__init__(sample_weight=sample_weight, power=3) -# TODO: We could allow strings for _base_loss_class (as before for the now removed +# TODO: We could allow strings for _base_loss (as before for the now removed # family parameter): 'normal', 'poisson', 'gamma', 'inverse-gaussian' class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): """Regression via a penalized Generalized Linear Model (GLM). @@ -83,15 +83,15 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): .. deprecated:: 1.1 `family` is deprecated in 1.1 and will be removed in 1.3. - Use `_base_loss_class` instead. + Use `_base_loss` instead. - _base_loss_class : subclass of BaseLoss, default=HalfSquaredError - A `_base_loss_class` contains a specific loss function as well as the link + _base_loss : BaseLoss, default=HalfSquaredError() + A `_base_loss` contains a specific loss function as well as the link function. The loss to be minimized specifies the distributional assumption of the GLM, i.e. the distribution from the EDM. Here are some examples: ======================= ======== ========================== - _base_loss_class Link Target Domain + _base_loss Link Target Domain ======================= ======== ========================== HalfSquaredError identity y any real number HalfPoissonLoss log 0 <= y @@ -105,10 +105,6 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): `X @ coeff + intercept` to prediction `y_pred`. For instance, with a log link, we have `y_pred = exp(X @ coeff + intercept)`. - base_loss_params : dictionary, default={} - Arguments to be passed to _base_loss_class, e.g. {"power": 1.5} with - `_base_loss_class=HalfTweedieLoss`. 
- solver : 'lbfgs', default='lbfgs' Algorithm to use in the optimization problem: @@ -152,8 +148,7 @@ def __init__( *, alpha=1.0, fit_intercept=True, - _base_loss_class=HalfSquaredError, - base_loss_params={}, + _base_loss=HalfSquaredError(), solver="lbfgs", max_iter=100, tol=1e-4, @@ -162,8 +157,7 @@ def __init__( ): self.alpha = alpha self.fit_intercept = fit_intercept - self._base_loss_class = _base_loss_class - self.base_loss_params = base_loss_params + self._base_loss = _base_loss self.solver = solver self.max_iter = max_iter self.tol = tol @@ -189,13 +183,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted model. """ - if not issubclass(self._base_loss_class, BaseLoss): - raise ValueError( - "The _base_loss_class must be an subclass of" - " sklearn._loss.loss.BaseLoss; ; got" - f" (_base_loss_class={self._base_loss_class})" - ) - check_scalar( self.alpha, name="alpha", @@ -266,14 +253,14 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape self._linear_loss = LinearModelLoss( - base_loss=self._get_base_loss_instance(), + base_loss=self._validate_base_loss(), fit_intercept=self.fit_intercept, ) if not self._linear_loss.base_loss.in_y_true_range(y): raise ValueError( "Some value(s) of y are out of the valid range of the loss" - f" {self._base_loss_class.__name__}." + f" {self._base_loss.__class__.__name__!r}." ) # TODO: if alpha=0 check that X is not rank deficient @@ -388,7 +375,7 @@ def score(self, X, y, sample_weight=None): D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal - for ``_base_loss_class=HalfSquaredError``. + for ``_base_loss=HalfSquaredError()``. D^2 is defined as :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, @@ -431,7 +418,7 @@ def score(self, X, y, sample_weight=None): if not base_loss.in_y_true_range(y): raise ValueError( "Some value(s) of y are out of the valid range of the loss" - f" {self._base_loss_class.__name__}." + f" {self._base_loss.__name__}." ) # Note that constant_to_optimal_zero is already multiplied by sample_weight. @@ -456,21 +443,28 @@ def score(self, X, y, sample_weight=None): return 1 - (deviance + constant) / (deviance_null + constant) def _more_tags(self): - # create instance of BaseLoss if fit wasn't called yet. - base_loss = self._base_loss_class(**self.base_loss_params) + # Create instance of BaseLoss if fit wasn't called yet. This is necessary as + # TweedieRegressor might set the used loss during fit different from + # self._base_loss. + base_loss = self._validate_base_loss() return {"requires_positive_y": not base_loss.in_y_true_range(-1.0)} - def _get_base_loss_instance(self): + def _validate_base_loss(self): # This is only necessary because of the link and power arguments of the # TweedieRegressor. # Note that we do not need to pass sample_weight to the loss class as this is # only needed to set loss.constant_hessian on which GLMs do not rely. - return self._base_loss_class(**self.base_loss_params) + if not isinstance(self._base_loss, BaseLoss): + raise ValueError( + "The _base_loss must be an instance of sklearn._loss.loss.BaseLoss; " + f"got _base_loss_class={self._base_loss}" + ) + return self._base_loss # FIXME: remove in v1.3 @deprecated( # type: ignore "Attribute `family` was deprecated in version 1.1 and " - "will be removed in 1.3. Use `_base_loss_class` instead." + "will be removed in 1.3. Use `_base_loss` instead." 
) @property def family(self): @@ -481,7 +475,7 @@ def family(self): elif isinstance(self, TweedieRegressor) and self.power == 3: return "inverse-gaussian" else: - return self._base_loss_class.__name__ + return self._base_loss.__class__.__name__ class PoissonRegressor(_GeneralizedLinearRegressor): @@ -583,18 +577,16 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, - _base_loss_class=HalfPoissonLoss, + _base_loss=HalfPoissonLoss(), max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose, ) - def _get_base_loss_instance(self): - if self._base_loss_class != HalfPoissonLoss: - raise ValueError( - "PoissonRegressor._base_loss_class must be HalfPoissonLoss!" - ) + def _validate_base_loss(self): + if not isinstance(self._base_loss, HalfPoissonLoss): + raise ValueError("PoissonRegressor._base_loss must be HalfPoissonLoss!") return HalfPoissonLoss() @@ -698,16 +690,16 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, - _base_loss_class=HalfGammaLoss, + _base_loss=HalfGammaLoss(), max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose, ) - def _get_base_loss_instance(self): - if self._base_loss_class != HalfGammaLoss: - raise ValueError("GammaRegressor._base_loss_class must be HalfGammaLoss!") + def _validate_base_loss(self): + if not isinstance(self._base_loss, HalfGammaLoss): + raise ValueError("GammaRegressor._base_loss must be HalfGammaLoss!") return HalfGammaLoss() @@ -843,8 +835,9 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, - _base_loss_class=HalfTweedieLoss, - base_loss_params={"power": power}, + # Initialize with fixed power. Otherwise, validation would happen in init. + # In the end, _validate_base_loss returns the used loss. + _base_loss=HalfTweedieLoss(power=1), max_iter=max_iter, tol=tol, warm_start=warm_start, @@ -853,21 +846,21 @@ def __init__( self.link = link self.power = power - def _get_base_loss_instance(self): + def _validate_base_loss(self): # This is only necessary because of the link and power arguments of the # TweedieRegressor. - if self._base_loss_class not in [HalfTweedieLoss, HalfTweedieLossIdentity]: + if not isinstance(self._base_loss, (HalfTweedieLoss, HalfTweedieLossIdentity)): raise ValueError( - "TweedieRegressor._base_loss_class must be HalfTweedieLoss or" + "TweedieRegressor._base_loss must be HalfTweedieLoss or" " HalfTweedieLossIdentity!" ) - if list(self.base_loss_params.keys()) != ["power"]: - raise ValueError( - "TweedieRegressor must have base_loss_params={'power': some float}." - ) - # In case parameter power was reset, make it consistent with base_loss_param - self.base_loss_params["power"] = self.power + # We could raise ValueError. But we want to be able to reset power + # gracefully. + # if self.power != self.base_loss.power: + # raise ValueError( + # "TweedieRegressor must have self.base_loss.power = self.power." 
+ # ) if self.link == "auto": if self.power <= 0: diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index fd7f07ac78380..7ef6a1c30472c 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -77,11 +77,11 @@ def test_glm_solver_argument(solver): glm.fit(X, y) -def test_glm_validate_base_loss_class(): +def test_glm_validate_base_loss(): y = np.array([1, 2]) X = np.array([[1], [2]]) - glm = _GeneralizedLinearRegressor(_base_loss_class=LogLink) - msg = "The _base_loss_class must be an subclass of sklearn._loss.loss.BaseLoss" + glm = _GeneralizedLinearRegressor(_base_loss=LogLink()) + msg = "The _base_loss must be an instance of sklearn._loss.loss.BaseLoss" with pytest.raises(ValueError, match=msg): glm.fit(X, y) @@ -164,9 +164,7 @@ def test_glm_warm_start_argument(warm_start): @pytest.mark.parametrize( "glm", [ - _GeneralizedLinearRegressor( - _base_loss_class=HalfTweedieLoss, base_loss_params={"power": 3} - ), + _GeneralizedLinearRegressor(_base_loss=HalfTweedieLoss(power=3)), PoissonRegressor(), GammaRegressor(), TweedieRegressor(power=1.5), @@ -188,7 +186,7 @@ def test_glm_identity_regression(fit_intercept): y = np.dot(X, coef) glm = _GeneralizedLinearRegressor( alpha=0, - _base_loss_class=HalfSquaredError, + _base_loss=HalfSquaredError(), fit_intercept=fit_intercept, tol=1e-12, ) @@ -204,18 +202,16 @@ def test_glm_identity_regression(fit_intercept): @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize("alpha", [0.0, 1.0]) @pytest.mark.parametrize( - "base_loss_class", [HalfSquaredError, HalfPoissonLoss, HalfGammaLoss] + "base_loss", [HalfSquaredError(), HalfPoissonLoss(), HalfGammaLoss()] ) -def test_glm_sample_weight_consistentcy(fit_intercept, alpha, base_loss_class): +def test_glm_sample_weight_consistentcy(fit_intercept, alpha, base_loss): """Test that the impact of sample_weight is consistent""" rng = np.random.RandomState(0) n_samples, n_features = 10, 5 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) - glm_params = dict( - alpha=alpha, _base_loss_class=base_loss_class, fit_intercept=fit_intercept - ) + glm_params = dict(alpha=alpha, _base_loss=base_loss, fit_intercept=fit_intercept) glm = _GeneralizedLinearRegressor(**glm_params).fit(X, y) coef = glm.coef_.copy() @@ -256,25 +252,24 @@ def test_glm_sample_weight_consistentcy(fit_intercept, alpha, base_loss_class): @pytest.mark.parametrize("fit_intercept", [True, False]) @pytest.mark.parametrize( - "base_loss_class, base_loss_param", + "base_loss", [ - (HalfPoissonLoss, {}), - (HalfGammaLoss, {}), - (HalfInverseGaussianLoss, {}), - (HalfTweedieLoss, {"power": 0}), - (HalfTweedieLoss, {"power": 1.5}), - (HalfTweedieLoss, {"power": 4.5}), + HalfPoissonLoss(), + HalfGammaLoss(), + HalfInverseGaussianLoss(), + HalfTweedieLoss(power=0), + HalfTweedieLoss(power=1.5), + HalfTweedieLoss(power=4.5), ], ) -def test_glm_log_regression(fit_intercept, base_loss_class, base_loss_param): +def test_glm_log_regression(fit_intercept, base_loss): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[0, 1, 2, 3, 4], [1, 1, 1, 1, 1]]).T y = np.exp(np.dot(X, coef)) glm = _GeneralizedLinearRegressor( alpha=0, - _base_loss_class=base_loss_class, - base_loss_params=base_loss_param, + _base_loss=base_loss, fit_intercept=fit_intercept, tol=1e-8, ) @@ -371,7 +366,7 @@ def test_normal_ridge_comparison( glm = _GeneralizedLinearRegressor( alpha=alpha, - 
_base_loss_class=HalfSquaredError, + _base_loss=HalfSquaredError(), fit_intercept=fit_intercept, max_iter=300, tol=1e-5, @@ -402,7 +397,7 @@ def test_poisson_glmnet(): glm = _GeneralizedLinearRegressor( alpha=1, fit_intercept=True, - _base_loss_class=HalfPoissonLoss, + _base_loss=HalfPoissonLoss(), tol=1e-7, max_iter=300, ) @@ -423,19 +418,19 @@ def test_convergence_warning(regression_data): "estimator, base_loss", [(PoissonRegressor, HalfPoissonLoss), (GammaRegressor, HalfGammaLoss)], ) -def test_raise_on_reset_base_loss_class(regression_data, estimator, base_loss): - # Make sure to raise an appropriate error if _base_loss_class was reset. +def test_raise_on_reset_base_loss(regression_data, estimator, base_loss): + # Make sure to raise an appropriate error if _base_loss was reset. X, y = regression_data est = estimator() - est._base_loss_class = HalfSquaredError - msg = f"{estimator.__name__}._base_loss_class must be {base_loss.__name__}!" + est._base_loss = HalfSquaredError() + msg = f"{estimator.__name__}._base_loss must be {base_loss.__name__}!" with pytest.raises(ValueError, match=msg): est.fit(X, y) -def test_tweedie_raise_on_reset_base_loss_class(regression_data): - # Make sure to raise an appropriate error if _base_loss_class or base_loss_params +def test_tweedie_raise_on_reset_base_loss(regression_data): + # Make sure to raise an appropriate error if _base_loss # was inconsistently reset for a Tweedie deviance loss. X, y = regression_data # make y positive @@ -443,31 +438,27 @@ def test_tweedie_raise_on_reset_base_loss_class(regression_data): power = 2.0 est = TweedieRegressor(power=power) - assert est._base_loss_class, HalfTweedieLoss - assert est.base_loss_params == {"power": power} + assert isinstance(est._base_loss, HalfTweedieLoss) assert est.power == power + # The following is not true to avoid input validation in init. + # assert est._base_loss.closs.power == power + est.fit(X, y) + assert est._linear_loss.base_loss.closs.power == power # reset power new_power = 0 est.power = new_power - est.fit(X, y) # fit resets base_loss_param["power"] - assert est.power == est.base_loss_params["power"] - - # reset base_loss_params - est.base_loss_params = {"should error": True} - msg = re.escape( - "TweedieRegressor must have base_loss_params={'power': some float}." - ) - with pytest.raises(ValueError, match=msg): - est.fit(X, y) + est.fit(X, y) # fit resets _linear_loss + assert new_power == est.power == est._linear_loss.base_loss.closs.power - # reset _base_loss_class - est = TweedieRegressor(power=power) - est._base_loss_class = HalfTweedieLossIdentity + # reset _base_loss + est = TweedieRegressor(power=power, link="identity") + assert isinstance(est._base_loss, HalfTweedieLoss) # this is the unused default est.fit(X, y) - est._base_loss_class = HalfSquaredError + est._base_loss = HalfSquaredError() + assert isinstance(est._linear_loss.base_loss, HalfTweedieLossIdentity) msg = ( - "TweedieRegressor._base_loss_class must be HalfTweedieLoss or" + "TweedieRegressor._base_loss must be HalfTweedieLoss or" " HalfTweedieLossIdentity!" 
) with pytest.raises(ValueError, match=msg): From 225863b4d82d2f0d164fae1142ae3009a60ba295 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 10 Mar 2022 15:46:48 +0100 Subject: [PATCH 31/43] TST fix test_tweedie_log_identity_consistency --- sklearn/_loss/tests/test_loss.py | 54 ++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index b5c33727f89ca..94caa29fb1752 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -1128,40 +1128,46 @@ def test_loss_pickle(loss): ) -@pytest.mark.parametrize("p", [0, 1, 1.5, 2, 3]) +@pytest.mark.parametrize("p", [-1.5, 0, 1, 1.5, 2, 3]) def test_tweedie_log_identity_consistency(p): + """Test for identical losses when only the link function is different.""" half_tweedie_log = HalfTweedieLoss(power=p) half_tweedie_identity = HalfTweedieLossIdentity(power=p) n_samples = 10 - # We intentionally pick an arbitrary value for y_true and keep it fixed - # because the dropped constant term in the HalfTweedieLoss class depends on - # y_true. - y_true = np.full(shape=n_samples, fill_value=0.1) - raw_prediction = np.linspace(-5.0, 5.0, n_samples) + y_true, raw_prediction = random_y_true_raw_prediction( + loss=half_tweedie_log, n_samples=n_samples, seed=42 + ) + y_pred = half_tweedie_log.link.inverse(raw_prediction) # exp(raw_prediction) # Let's compare the loss values, up to some constant term that is dropped # in HalfTweedieLoss but not in HalfTweedieLossIdentity. - loss_log_without_constant_terms = half_tweedie_log.loss( + loss_log = half_tweedie_log.loss( y_true=y_true, raw_prediction=raw_prediction - ) - loss_identity_with_constant_terms = half_tweedie_identity.loss( - y_true=y_true, raw_prediction=np.exp(raw_prediction) - ) - # Check that the difference between the two ways of computing the loss - # values are just the constant terms that are computed by - # HalfTweedieLossIdentity but ignored by HalfTweedieLoss (with built-in log - # link) - diff = loss_log_without_constant_terms - loss_identity_with_constant_terms - assert_allclose(diff, diff[0]) - - # The gradient and hessian should be the same because they do not depend on - # the constant terms. + ) + half_tweedie_log.constant_to_optimal_zero(y_true) + loss_identity = half_tweedie_identity.loss( + y_true=y_true, raw_prediction=y_pred + ) + half_tweedie_identity.constant_to_optimal_zero(y_true) + # Note that HalfTweedieLoss ignores different constant terms than + # HalfTweedieLossIdentity. Constant terms means terms not depending on + # raw_prediction. By adding these terms, `constant_to_optimal_zero`, both losses + # give the same values. + assert_allclose(loss_log, loss_identity) + + # For gradients and hessians, the constant terms do not matter. We have, however, + # to account for the chain rule, i.e. 
with x=raw_prediction + # gradient_log(x) = d/dx loss_log(x) + # = d/dx loss_identity(exp(x)) + # = exp(x) * gradient_identity(exp(x)) + # Similarly, + # hessian_log(x) = exp(x) * gradient_identity(exp(x)) + # + exp(x)**2 * hessian_identity(x) gradient_log, hessian_log = half_tweedie_log.gradient_hessian( y_true=y_true, raw_prediction=raw_prediction ) gradient_identity, hessian_identity = half_tweedie_identity.gradient_hessian( - y_true=y_true, raw_prediction=np.exp(raw_prediction) + y_true=y_true, raw_prediction=y_pred + ) + assert_allclose(gradient_log, y_pred * gradient_identity) + assert_allclose( + hessian_log, y_pred * gradient_identity + y_pred**2 * hessian_identity ) - # XXX: the following always fails even for large values of atol - # assert_allclose(gradient_log, gradient_identity) - # assert_allclose(hessian_log, hessian_identity) From 38b31043145e6ec97daccd92bac5d250a119c68c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 10 Mar 2022 21:25:42 +0100 Subject: [PATCH 32/43] DOC add deprecation and change info in whatsnew --- doc/whats_new/v1.1.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 87c939a59897b..dc1b517b61404 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -566,7 +566,11 @@ Changelog :user:`Olivier Grisel `. - |Fix| The input parameter `family` of :class:`linear_model.TweedieRegressor` is not - validated in `__init__` anymore. This (private) parameter was removed. + validated in `__init__` anymore. Instead, this (private) parameter is deprecated in + :class:`linear_model.GammaRegressor`, :class:`linear_model.PoissonRegressor` and + :class:`linear_model.TweedieRegressor`, and will be removed in 1.3. + Note that the value of `family` for :class:`linear_model.TweedieRegressor` changed + in v1.1. :pr:`22548` by :user:`Christian Lorentzen `. - |Enhancement| :class:`linear_model.BayesianRidge` and From 4bfc1e959f6a37da0ae50218b2f044619044a165 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 18 Mar 2022 09:10:15 +0100 Subject: [PATCH 33/43] Revert "MAINT remove glm_distribution.py" This reverts commit 719bb1be3ee049be23ae927efb357578d04949f7. --- sklearn/_loss/glm_distribution.py | 369 +++++++++++++++++++ sklearn/_loss/tests/test_glm_distribution.py | 121 ++++++ 2 files changed, 490 insertions(+) create mode 100644 sklearn/_loss/glm_distribution.py create mode 100644 sklearn/_loss/tests/test_glm_distribution.py diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py new file mode 100644 index 0000000000000..dfc512c8b10b7 --- /dev/null +++ b/sklearn/_loss/glm_distribution.py @@ -0,0 +1,369 @@ +""" +Distribution functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod +from collections import namedtuple +import numbers + +import numpy as np +from scipy.special import xlogy + + +DistributionBoundary = namedtuple("DistributionBoundary", ("value", "inclusive")) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by + + .. 
math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`, + unit variance :math:`v(y_\textrm{pred})` and + unit deviance :math:`d(y,y_\textrm{pred})`. + + Methods + ------- + deviance + deviance_derivative + in_y_range + unit_deviance + unit_deviance_derivative + unit_variance + + References + ---------- + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + + def in_y_range(self, y): + """Returns ``True`` if y is in the valid range of Y~EDM. + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + """ + # Note that currently supported distributions have +inf upper bound + + if not isinstance(self._lower_bound, DistributionBoundary): + raise TypeError( + "_lower_bound attribute must be of type DistributionBoundary" + ) + + if self._lower_bound.inclusive: + return np.greater_equal(y, self._lower_bound.value) + else: + return np.greater(y, self._lower_bound.value) + + @abstractmethod + def unit_variance(self, y_pred): + r"""Compute the unit variance function. + + The unit variance :math:`v(y_\textrm{pred})` determines the variance as + a function of the mean :math:`y_\textrm{pred}` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(y_\textrm{pred}_i)`. + It can also be derived from the unit deviance + :math:`d(y,y_\textrm{pred})` as + + .. math:: v(y_\textrm{pred}) = \frac{2}{ + \frac{\partial^2 d(y,y_\textrm{pred})}{ + \partialy_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}} + + See also :func:`variance`. + + Parameters + ---------- + y_pred : array of shape (n_samples,) + Predicted mean. + """ + + @abstractmethod + def unit_deviance(self, y, y_pred, check_input=False): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the + log-likelihood as + :math:`d(y,y_\textrm{pred}) = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or y_pred values, otherwise + they will be propagated as NaN. + Returns + ------- + deviance: array of shape (n_samples,) + Computed deviance + """ + + def unit_deviance_derivative(self, y, y_pred): + r"""Compute the derivative of the unit deviance w.r.t. y_pred. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partialy_\textrm{pred}}d(y,y_\textrm{pred}) + = -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}` + with unit variance :math:`v(y_\textrm{pred})`. + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + """ + return -2 * (y - y_pred) / self.unit_variance(y_pred) + + def deviance(self, y, y_pred, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, y_\textrm{pred}_i)` + with weights :math:`s_i` and unit deviance + :math:`d(y,y_\textrm{pred})`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. 
+ + y_pred : array of shape (n_samples,) + Predicted mean. + + weights : {int, array of shape (n_samples,)}, default=1 + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, y_pred)) + + def deviance_derivative(self, y, y_pred, weights=1): + r"""Compute the derivative of the deviance w.r.t. y_pred. + + It gives :math:`\frac{\partial}{\partial y_\textrm{pred}} + D(y, \y_\textrm{pred}; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + y_pred : array, shape (n_samples,) + Predicted mean. + + weights : {int, array of shape (n_samples,)}, default=1 + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, y_pred) + + +class TweedieDistribution(ExponentialDispersionModel): + r"""A class for the Tweedie distribution. + + A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]` + is uniquely defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^power`. + + Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (1,2) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + + Parameters + ---------- + power : float, default=0 + The variance power of the `unit_variance` + :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`. + For ``0=1." + ) + elif 1 <= power < 2: + # Poisson or Compound Poisson distribution + self._lower_bound = DistributionBoundary(0, inclusive=True) + elif power >= 2: + # Gamma, Positive Stable, Inverse Gaussian distributions + self._lower_bound = DistributionBoundary(0, inclusive=False) + else: # pragma: no cover + # this branch should be unreachable. + raise ValueError + + self._power = power + + def unit_variance(self, y_pred): + """Compute the unit variance of a Tweedie distribution + v(y_\textrm{pred})=y_\textrm{pred}**power. + + Parameters + ---------- + y_pred : array of shape (n_samples,) + Predicted mean. + """ + return np.power(y_pred, self.power) + + def unit_deviance(self, y, y_pred, check_input=False): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the + log-likelihood as + :math:`d(y,y_\textrm{pred}) = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or y_pred values, otherwise + they will be propagated as NaN. + Returns + ------- + deviance: array of shape (n_samples,) + Computed deviance + """ + p = self.power + + if check_input: + message = ( + "Mean Tweedie deviance error with power={} can only be used on ".format( + p + ) + ) + if p < 0: + # 'Extreme stable', y any real number, y_pred > 0 + if (y_pred <= 0).any(): + raise ValueError(message + "strictly positive y_pred.") + elif p == 0: + # Normal, y and y_pred can be any real number + pass + elif 0 < p < 1: + raise ValueError( + "Tweedie deviance is only defined for power<=0 and power>=1." + ) + elif 1 <= p < 2: + # Poisson and compound Poisson distribution, y >= 0, y_pred > 0 + if (y < 0).any() or (y_pred <= 0).any(): + raise ValueError( + message + "non-negative y and strictly positive y_pred." 
+ ) + elif p >= 2: + # Gamma and Extreme stable distribution, y and y_pred > 0 + if (y <= 0).any() or (y_pred <= 0).any(): + raise ValueError(message + "strictly positive y and y_pred.") + else: # pragma: nocover + # Unreachable statement + raise ValueError + + if p < 0: + # 'Extreme stable', y any real number, y_pred > 0 + dev = 2 * ( + np.power(np.maximum(y, 0), 2 - p) / ((1 - p) * (2 - p)) + - y * np.power(y_pred, 1 - p) / (1 - p) + + np.power(y_pred, 2 - p) / (2 - p) + ) + + elif p == 0: + # Normal distribution, y and y_pred any real number + dev = (y - y_pred) ** 2 + elif p < 1: + raise ValueError( + "Tweedie deviance is only defined for power<=0 and power>=1." + ) + elif p == 1: + # Poisson distribution + dev = 2 * (xlogy(y, y / y_pred) - y + y_pred) + elif p == 2: + # Gamma distribution + dev = 2 * (np.log(y_pred / y) + y / y_pred - 1) + else: + dev = 2 * ( + np.power(y, 2 - p) / ((1 - p) * (2 - p)) + - y * np.power(y_pred, 1 - p) / (1 - p) + + np.power(y_pred, 2 - p) / (2 - p) + ) + return dev + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution.""" + + def __init__(self): + super().__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution.""" + + def __init__(self): + super().__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution.""" + + def __init__(self): + super().__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution.""" + + def __init__(self): + super().__init__(power=3) + + +EDM_DISTRIBUTIONS = { + "normal": NormalDistribution, + "poisson": PoissonDistribution, + "gamma": GammaDistribution, + "inverse-gaussian": InverseGaussianDistribution, +} diff --git a/sklearn/_loss/tests/test_glm_distribution.py b/sklearn/_loss/tests/test_glm_distribution.py new file mode 100644 index 0000000000000..453f61e2f3214 --- /dev/null +++ b/sklearn/_loss/tests/test_glm_distribution.py @@ -0,0 +1,121 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause +import numpy as np +from numpy.testing import ( + assert_allclose, + assert_array_equal, +) +from scipy.optimize import check_grad +import pytest + +from sklearn._loss.glm_distribution import ( + TweedieDistribution, + NormalDistribution, + PoissonDistribution, + GammaDistribution, + InverseGaussianDistribution, + DistributionBoundary, +) + + +@pytest.mark.parametrize( + "family, expected", + [ + (NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True]), + ], +) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +def test_invalid_distribution_bound(): + dist = TweedieDistribution() + dist._lower_bound = 0 + with pytest.raises(TypeError, match="must be of type DistributionBoundary"): + dist.in_y_range([-1, 0, 1]) + + +def test_tweedie_distribution_power(): + msg = "distribution is only defined for power<=0 and power>=1" + with pytest.raises(ValueError, match=msg): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with 
pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert isinstance(dist._lower_bound, DistributionBoundary) + + assert dist._lower_bound.inclusive is False + dist.power = 1 + assert dist._lower_bound.value == 0.0 + assert dist._lower_bound.inclusive is True + + +@pytest.mark.parametrize( + "family, chk_values", + [ + (NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + ], +) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_allclose(family.deviance(x, x), 0, atol=1e-9) + + +@pytest.mark.parametrize( + "family", + [ + NormalDistribution(), + PoissonDistribution(), + GammaDistribution(), + InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=-4), + ], + ids=lambda x: x.__class__.__name__, +) +def test_deviance_derivative(family): + """Test deviance derivative for different families.""" + rng = np.random.RandomState(0) + y_true = rng.rand(10) + # make data positive + y_true += np.abs(y_true.min()) + 1e-2 + + y_pred = y_true + np.fmax(rng.rand(10), 0.0) + + dev = family.deviance(y_true, y_pred) + assert isinstance(dev, float) + dev_derivative = family.deviance_derivative(y_true, y_pred) + assert dev_derivative.shape == y_pred.shape + + err = check_grad( + lambda y_pred: family.deviance(y_true, y_pred), + lambda y_pred: family.deviance_derivative(y_true, y_pred), + y_pred, + ) / np.linalg.norm(dev_derivative) + assert abs(err) < 1e-6 From 7f428b568ed0c9836bc16ddb4aee3016058ef7d7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 18 Mar 2022 10:12:41 +0100 Subject: [PATCH 34/43] MNT add removal TODO for glm_distribution.py --- sklearn/_loss/glm_distribution.py | 4 ++++ sklearn/_loss/tests/test_glm_distribution.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index dfc512c8b10b7..d5f41c1d83f53 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -4,6 +4,10 @@ # Author: Christian Lorentzen # License: BSD 3 clause +# +# TODO: Remove with v1.3 +# This is only used for backward compatibility in _GeneralizedLinearRegressor +# for the deprecated family attribute. 
from abc import ABCMeta, abstractmethod from collections import namedtuple diff --git a/sklearn/_loss/tests/test_glm_distribution.py b/sklearn/_loss/tests/test_glm_distribution.py index 453f61e2f3214..3dd48f814db87 100644 --- a/sklearn/_loss/tests/test_glm_distribution.py +++ b/sklearn/_loss/tests/test_glm_distribution.py @@ -1,6 +1,8 @@ # Authors: Christian Lorentzen # # License: BSD 3 clause +# +# TODO: Remove with v1.3 import numpy as np from numpy.testing import ( assert_allclose, From a1474d0776480423a7786150c3629ec99ab134bc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 18 Mar 2022 10:15:40 +0100 Subject: [PATCH 35/43] DEP backward compatible deprecation of family property --- doc/whats_new/v1.1.rst | 6 ++-- sklearn/linear_model/_glm/glm.py | 31 ++++++++++----------- sklearn/linear_model/_glm/tests/test_glm.py | 13 +++++++-- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index dc1b517b61404..c22946212b031 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -565,12 +565,10 @@ Changelog sub-problem while now all of them are recorded. :pr:`21998` by :user:`Olivier Grisel `. -- |Fix| The input parameter `family` of :class:`linear_model.TweedieRegressor` is not - validated in `__init__` anymore. Instead, this (private) parameter is deprecated in +- |Fix| The property `family` of :class:`linear_model.TweedieRegressor` is not + validated in `__init__` anymore. Instead, this (private) property is deprecated in :class:`linear_model.GammaRegressor`, :class:`linear_model.PoissonRegressor` and :class:`linear_model.TweedieRegressor`, and will be removed in 1.3. - Note that the value of `family` for :class:`linear_model.TweedieRegressor` changed - in v1.1. :pr:`22548` by :user:`Christian Lorentzen `. - |Enhancement| :class:`linear_model.BayesianRidge` and diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 22e99806a8897..8c9f451479ffd 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -11,6 +11,7 @@ import numpy as np import scipy.optimize +from ..._loss.glm_distribution import TweedieDistribution from ..._loss.loss import ( BaseLoss, HalfGammaLoss, @@ -76,15 +77,6 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \ - or a BaseLoss instance, default='normal' - The distributional assumption of the GLM, i.e. which distribution from - the EDM, specifies the loss function to be minimized. - - .. deprecated:: 1.1 - `family` is deprecated in 1.1 and will be removed in 1.3. - Use `_base_loss` instead. - _base_loss : BaseLoss, default=HalfSquaredError() A `_base_loss` contains a specific loss function as well as the link function. The loss to be minimized specifies the distributional assumption of @@ -374,8 +366,8 @@ def score(self, X, y, sample_weight=None): """Compute D^2, the percentage of deviance explained. D^2 is a generalization of the coefficient of determination R^2. - R^2 uses squared error and D^2 deviance. Note that those two are equal - for ``_base_loss=HalfSquaredError()``. + R^2 uses squared error and D^2 uses the deviance of this GLM, see the + :ref:`User Guide `. 
D^2 is defined as :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, @@ -401,6 +393,9 @@ def score(self, X, y, sample_weight=None): score : float D^2 of self.predict(X) w.r.t. y. """ + # TODO: Adapt link to User Guide in the docstring, once + # https://github.com/scikit-learn/scikit-learn/pull/22118 is merged. + # # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) @@ -463,19 +458,23 @@ def _validate_base_loss(self): # FIXME: remove in v1.3 @deprecated( # type: ignore - "Attribute `family` was deprecated in version 1.1 and " - "will be removed in 1.3. Use `_base_loss` instead." + "Attribute `family` was deprecated in version 1.1 and will be removed in 1.3." ) @property def family(self): + """Ensure backward compatibility for the time of deprecation.""" if isinstance(self, PoissonRegressor): return "poisson" elif isinstance(self, GammaRegressor): return "gamma" - elif isinstance(self, TweedieRegressor) and self.power == 3: - return "inverse-gaussian" + elif isinstance(self, TweedieRegressor): + return TweedieDistribution(power=self.power) else: - return self._base_loss.__class__.__name__ + raise ValueError( # noqa + "This should never happen. You presumably accessed the deprecated " + "`family` attribute from a subclass of the private scikit-learn class " + "_GeneralizedLinearRegressor." + ) class PoissonRegressor(_GeneralizedLinearRegressor): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 7ef6a1c30472c..2f8c4e568a764 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -8,6 +8,7 @@ import pytest import warnings +from sklearn._loss.glm_distribution import TweedieDistribution from sklearn._loss.link import IdentityLink, LogLink from sklearn._loss.loss import ( HalfGammaLoss, @@ -532,10 +533,16 @@ def test_tags(estimator, value): [ (PoissonRegressor(), "poisson"), (GammaRegressor(), "gamma"), - (TweedieRegressor(power=3), "inverse-gaussian"), - (TweedieRegressor(), "HalfTweedieLoss"), + (TweedieRegressor(), TweedieDistribution()), + (TweedieRegressor(power=2), TweedieDistribution(power=2)), + (TweedieRegressor(power=3), TweedieDistribution(power=3)), ], ) def test_family_deprecation(est, family): + """Test backward compatibility of the family property.""" with pytest.warns(FutureWarning, match="`family` was deprecated"): - assert est.family == family + if isinstance(family, str): + assert est.family == family + else: + assert est.family.__class__ == family.__class__ + assert est.family.power == family.power From f23dd310c0000e74dce797e1329c0e6227f28d55 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Fri, 18 Mar 2022 23:57:42 -0400 Subject: [PATCH 36/43] CLN Idea removing _base_loss --- sklearn/linear_model/_glm/glm.py | 50 +++------ sklearn/linear_model/_glm/tests/test_glm.py | 110 ++++---------------- 2 files changed, 33 insertions(+), 127 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 8c9f451479ffd..2407a12e0594f 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -13,7 +13,6 @@ from ..._loss.glm_distribution import TweedieDistribution from ..._loss.loss import ( - BaseLoss, HalfGammaLoss, HalfPoissonLoss, HalfSquaredError, @@ -140,7 +139,6 @@ def __init__( *, alpha=1.0, fit_intercept=True, - _base_loss=HalfSquaredError(), solver="lbfgs", max_iter=100, tol=1e-4, @@ -149,7 +147,6 @@ def __init__( ): self.alpha = alpha self.fit_intercept = fit_intercept - self._base_loss = _base_loss self.solver = solver self.max_iter = max_iter self.tol = tol @@ -243,9 +240,10 @@ def fit(self, X, y, sample_weight=None): sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype) n_samples, n_features = X.shape + self._base_loss = self._get_loss() self._linear_loss = LinearModelLoss( - base_loss=self._validate_base_loss(), + base_loss=self._base_loss, fit_intercept=self.fit_intercept, ) @@ -441,20 +439,17 @@ def _more_tags(self): # Create instance of BaseLoss if fit wasn't called yet. This is necessary as # TweedieRegressor might set the used loss during fit different from # self._base_loss. - base_loss = self._validate_base_loss() + base_loss = self._get_loss() return {"requires_positive_y": not base_loss.in_y_true_range(-1.0)} - def _validate_base_loss(self): - # This is only necessary because of the link and power arguments of the - # TweedieRegressor. - # Note that we do not need to pass sample_weight to the loss class as this is - # only needed to set loss.constant_hessian on which GLMs do not rely. - if not isinstance(self._base_loss, BaseLoss): - raise ValueError( - "The _base_loss must be an instance of sklearn._loss.loss.BaseLoss; " - f"got _base_loss_class={self._base_loss}" - ) - return self._base_loss + def _get_loss(self): + """This is only necessary because of the link and power arguments of the + TweedieRegressor. + + Note that we do not need to pass sample_weight to the loss class as this is + only needed to set loss.constant_hessian on which GLMs do not rely. + """ + return HalfSquaredError() # FIXME: remove in v1.3 @deprecated( # type: ignore @@ -576,16 +571,13 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, - _base_loss=HalfPoissonLoss(), max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose, ) - def _validate_base_loss(self): - if not isinstance(self._base_loss, HalfPoissonLoss): - raise ValueError("PoissonRegressor._base_loss must be HalfPoissonLoss!") + def _get_loss(self): return HalfPoissonLoss() @@ -689,16 +681,13 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, - _base_loss=HalfGammaLoss(), max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose, ) - def _validate_base_loss(self): - if not isinstance(self._base_loss, HalfGammaLoss): - raise ValueError("GammaRegressor._base_loss must be HalfGammaLoss!") + def _get_loss(self): return HalfGammaLoss() @@ -834,9 +823,6 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, - # Initialize with fixed power. Otherwise, validation would happen in init. - # In the end, _validate_base_loss returns the used loss. 
- _base_loss=HalfTweedieLoss(power=1), max_iter=max_iter, tol=tol, warm_start=warm_start, @@ -845,15 +831,7 @@ def __init__( self.link = link self.power = power - def _validate_base_loss(self): - # This is only necessary because of the link and power arguments of the - # TweedieRegressor. - if not isinstance(self._base_loss, (HalfTweedieLoss, HalfTweedieLossIdentity)): - raise ValueError( - "TweedieRegressor._base_loss must be HalfTweedieLoss or" - " HalfTweedieLossIdentity!" - ) - + def _get_loss(self): # We could raise ValueError. But we want to be able to reset power # gracefully. # if self.power != self.base_loss.power: diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2f8c4e568a764..2723b0921537d 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -8,18 +8,12 @@ import pytest import warnings +from sklearn.base import clone from sklearn._loss.glm_distribution import TweedieDistribution from sklearn._loss.link import IdentityLink, LogLink -from sklearn._loss.loss import ( - HalfGammaLoss, - HalfPoissonLoss, - HalfSquaredError, - HalfTweedieLoss, - HalfTweedieLossIdentity, -) + from sklearn.datasets import make_regression from sklearn.linear_model._glm import _GeneralizedLinearRegressor -from sklearn.linear_model._glm.glm import HalfInverseGaussianLoss from sklearn.linear_model import TweedieRegressor, PoissonRegressor, GammaRegressor from sklearn.linear_model import Ridge from sklearn.exceptions import ConvergenceWarning @@ -78,15 +72,6 @@ def test_glm_solver_argument(solver): glm.fit(X, y) -def test_glm_validate_base_loss(): - y = np.array([1, 2]) - X = np.array([[1], [2]]) - glm = _GeneralizedLinearRegressor(_base_loss=LogLink()) - msg = "The _base_loss must be an instance of sklearn._loss.loss.BaseLoss" - with pytest.raises(ValueError, match=msg): - glm.fit(X, y) - - @pytest.mark.parametrize( "Estimator", [_GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor, TweedieRegressor], @@ -165,7 +150,7 @@ def test_glm_warm_start_argument(warm_start): @pytest.mark.parametrize( "glm", [ - _GeneralizedLinearRegressor(_base_loss=HalfTweedieLoss(power=3)), + TweedieRegressor(power=3), PoissonRegressor(), GammaRegressor(), TweedieRegressor(power=1.5), @@ -187,7 +172,6 @@ def test_glm_identity_regression(fit_intercept): y = np.dot(X, coef) glm = _GeneralizedLinearRegressor( alpha=0, - _base_loss=HalfSquaredError(), fit_intercept=fit_intercept, tol=1e-12, ) @@ -203,18 +187,18 @@ def test_glm_identity_regression(fit_intercept): @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize("alpha", [0.0, 1.0]) @pytest.mark.parametrize( - "base_loss", [HalfSquaredError(), HalfPoissonLoss(), HalfGammaLoss()] + "GLMEstimator", [_GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor] ) -def test_glm_sample_weight_consistentcy(fit_intercept, alpha, base_loss): +def test_glm_sample_weight_consistentcy(fit_intercept, alpha, GLMEstimator): """Test that the impact of sample_weight is consistent""" rng = np.random.RandomState(0) n_samples, n_features = 10, 5 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) - glm_params = dict(alpha=alpha, _base_loss=base_loss, fit_intercept=fit_intercept) + glm_params = dict(alpha=alpha, fit_intercept=fit_intercept) - glm = _GeneralizedLinearRegressor(**glm_params).fit(X, y) + glm = GLMEstimator(**glm_params).fit(X, y) coef = glm.coef_.copy() # sample_weight=np.ones(..) 
should be equivalent to sample_weight=None @@ -243,34 +227,31 @@ def test_glm_sample_weight_consistentcy(fit_intercept, alpha, base_loss): sample_weight_1 = np.ones(len(y)) sample_weight_1[: n_samples // 2] = 2 - glm1 = _GeneralizedLinearRegressor(**glm_params).fit( - X, y, sample_weight=sample_weight_1 - ) + glm1 = GLMEstimator(**glm_params).fit(X, y, sample_weight=sample_weight_1) - glm2 = _GeneralizedLinearRegressor(**glm_params).fit(X2, y2, sample_weight=None) + glm2 = GLMEstimator(**glm_params).fit(X2, y2, sample_weight=None) assert_allclose(glm1.coef_, glm2.coef_) @pytest.mark.parametrize("fit_intercept", [True, False]) @pytest.mark.parametrize( - "base_loss", + "estimator", [ - HalfPoissonLoss(), - HalfGammaLoss(), - HalfInverseGaussianLoss(), - HalfTweedieLoss(power=0), - HalfTweedieLoss(power=1.5), - HalfTweedieLoss(power=4.5), + PoissonRegressor(), + GammaRegressor(), + TweedieRegressor(power=3.0), + TweedieRegressor(power=0, link="log"), + TweedieRegressor(power=1.5), + TweedieRegressor(power=4.5), ], ) -def test_glm_log_regression(fit_intercept, base_loss): +def test_glm_log_regression(fit_intercept, estimator): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[0, 1, 2, 3, 4], [1, 1, 1, 1, 1]]).T y = np.exp(np.dot(X, coef)) - glm = _GeneralizedLinearRegressor( + glm = clone(estimator).set_params( alpha=0, - _base_loss=base_loss, fit_intercept=fit_intercept, tol=1e-8, ) @@ -367,7 +348,6 @@ def test_normal_ridge_comparison( glm = _GeneralizedLinearRegressor( alpha=alpha, - _base_loss=HalfSquaredError(), fit_intercept=fit_intercept, max_iter=300, tol=1e-5, @@ -395,10 +375,9 @@ def test_poisson_glmnet(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - glm = _GeneralizedLinearRegressor( + glm = PoissonRegressor( alpha=1, fit_intercept=True, - _base_loss=HalfPoissonLoss(), tol=1e-7, max_iter=300, ) @@ -415,57 +394,6 @@ def test_convergence_warning(regression_data): est.fit(X, y) -@pytest.mark.parametrize( - "estimator, base_loss", - [(PoissonRegressor, HalfPoissonLoss), (GammaRegressor, HalfGammaLoss)], -) -def test_raise_on_reset_base_loss(regression_data, estimator, base_loss): - # Make sure to raise an appropriate error if _base_loss was reset. - X, y = regression_data - - est = estimator() - est._base_loss = HalfSquaredError() - msg = f"{estimator.__name__}._base_loss must be {base_loss.__name__}!" - with pytest.raises(ValueError, match=msg): - est.fit(X, y) - - -def test_tweedie_raise_on_reset_base_loss(regression_data): - # Make sure to raise an appropriate error if _base_loss - # was inconsistently reset for a Tweedie deviance loss. - X, y = regression_data - # make y positive - y = np.abs(y) + 1.0 - - power = 2.0 - est = TweedieRegressor(power=power) - assert isinstance(est._base_loss, HalfTweedieLoss) - assert est.power == power - # The following is not true to avoid input validation in init. 
- # assert est._base_loss.closs.power == power - est.fit(X, y) - assert est._linear_loss.base_loss.closs.power == power - - # reset power - new_power = 0 - est.power = new_power - est.fit(X, y) # fit resets _linear_loss - assert new_power == est.power == est._linear_loss.base_loss.closs.power - - # reset _base_loss - est = TweedieRegressor(power=power, link="identity") - assert isinstance(est._base_loss, HalfTweedieLoss) # this is the unused default - est.fit(X, y) - est._base_loss = HalfSquaredError() - assert isinstance(est._linear_loss.base_loss, HalfTweedieLossIdentity) - msg = ( - "TweedieRegressor._base_loss must be HalfTweedieLoss or" - " HalfTweedieLossIdentity!" - ) - with pytest.raises(ValueError, match=msg): - est.fit(X, y) - - @pytest.mark.parametrize( "name, link_class", [("identity", IdentityLink), ("log", LogLink)] ) From db4d402c512960927697d3b6a384c0de05633759 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 24 Mar 2022 23:15:34 +0100 Subject: [PATCH 37/43] CLN fix type in test name --- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2723b0921537d..c1a1b5d28b0dc 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -189,7 +189,7 @@ def test_glm_identity_regression(fit_intercept): @pytest.mark.parametrize( "GLMEstimator", [_GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor] ) -def test_glm_sample_weight_consistentcy(fit_intercept, alpha, GLMEstimator): +def test_glm_sample_weight_consistency(fit_intercept, alpha, GLMEstimator): """Test that the impact of sample_weight is consistent""" rng = np.random.RandomState(0) n_samples, n_features = 10, 5 From 87dd21751331405f85fa4e8bd7b854f330e443f9 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 27 Mar 2022 17:31:38 -0400 Subject: [PATCH 38/43] Small refactor for _mean_tweedie_deviance --- sklearn/metrics/_regression.py | 60 +++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index bf16a6a3ed662..270d6a2c19cfe 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -967,6 +967,35 @@ def max_error(y_true, y_pred): return np.max(np.abs(y_true - y_pred)) +def _mean_tweedie_deviance(y_true, y_pred, sample_weight, power): + """Mean Tweedie deviance regression loss.""" + p = power + if p < 0: + # 'Extreme stable', y any real number, y_pred > 0 + dev = 2 * ( + np.power(np.maximum(y_true, 0), 2 - p) / ((1 - p) * (2 - p)) + - y_true * np.power(y_pred, 1 - p) / (1 - p) + + np.power(y_pred, 2 - p) / (2 - p) + ) + elif p == 0: + # Normal distribution, y and y_pred any real number + dev = (y_true - y_pred) ** 2 + elif p == 1: + # Poisson distribution + dev = 2 * (xlogy(y_true, y_true / y_pred) - y_true + y_pred) + elif p == 2: + # Gamma distribution + dev = 2 * (np.log(y_pred / y_true) + y_true / y_pred - 1) + else: + dev = 2 * ( + np.power(y_true, 2 - p) / ((1 - p) * (2 - p)) + - y_true * np.power(y_pred, 1 - p) / (1 - p) + + np.power(y_pred, 2 - p) / (2 - p) + ) + + return np.average(dev, weights=sample_weight) + + def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): """Mean Tweedie deviance regression loss. 
@@ -1054,30 +1083,9 @@ def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): # Unreachable statement raise ValueError - if p < 0: - # 'Extreme stable', y any real number, y_pred > 0 - dev = 2 * ( - np.power(np.maximum(y_true, 0), 2 - p) / ((1 - p) * (2 - p)) - - y_true * np.power(y_pred, 1 - p) / (1 - p) - + np.power(y_pred, 2 - p) / (2 - p) - ) - elif p == 0: - # Normal distribution, y and y_pred any real number - dev = (y_true - y_pred) ** 2 - elif p == 1: - # Poisson distribution - dev = 2 * (xlogy(y_true, y_true / y_pred) - y_true + y_pred) - elif p == 2: - # Gamma distribution - dev = 2 * (np.log(y_pred / y_true) + y_true / y_pred - 1) - else: - dev = 2 * ( - np.power(y_true, 2 - p) / ((1 - p) * (2 - p)) - - y_true * np.power(y_pred, 1 - p) / (1 - p) - + np.power(y_pred, 2 - p) / (2 - p) - ) - - return np.average(dev, weights=sample_weight) + return _mean_tweedie_deviance( + y_true, y_pred, sample_weight=sample_weight, power=power + ) def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None): @@ -1243,8 +1251,8 @@ def d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0): y_true, y_pred, sample_weight=sample_weight, power=power ) - y_avg = np.full_like(y_true, np.average(y_true, weights=sample_weight)) - denominator = mean_tweedie_deviance( + y_avg = np.average(y_true, weights=sample_weight) + denominator = _mean_tweedie_deviance( y_true, y_avg, sample_weight=sample_weight, power=power ) From 5170acac6b9f6f19dc30519ab1703525e726558c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 28 Mar 2022 07:47:02 +0200 Subject: [PATCH 39/43] CLN fix merge conflict in whatsnew --- doc/whats_new/v1.1.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 49ed4d371aa3b..13b473fba11a9 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -526,12 +526,6 @@ Changelog :class:`kernel_approximation.Nystroem`, :class:`kernel_approximation.PolynomialCountSketch`, :class:`kernel_approximation.RBFSampler`, and - :class:`kernel_approximation.SkewedChi2Sampler`. :pr:`22694` by `Thomas Fan`_. - -- |API| Adds :term:`get_feature_names_out` to the following transformers - of the :mod:`~sklearn.kernel_approximation` module: - :class:`~sklearn.kernel_approximation.AdditiveChi2Sampler`. - :pr:`22137` by `Thomas Fan`_. :class:`kernel_approximation.SkewedChi2Sampler`. :pr:`22137` and :pr:`22694` by `Thomas Fan`_. From 5a5a22fdd9ac81d39971c8d65f39e883adab9e59 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 28 Mar 2022 07:58:40 +0200 Subject: [PATCH 40/43] CLN remove HalfInverseGaussianLoss --- sklearn/linear_model/_glm/glm.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 2407a12e0594f..e60985fca4869 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -27,18 +27,6 @@ from .._linear_loss import LinearModelLoss -class HalfInverseGaussianLoss(HalfTweedieLoss): - """Half inverse Gaussian deviance loss with log-link, for regression. - - This is equivalent to HalfTweedieLoss(power=3). - """ - - def __init__(self, sample_weight=None): - super().__init__(sample_weight=sample_weight, power=3) - - -# TODO: We could allow strings for _base_loss (as before for the now removed -# family parameter): 'normal', 'poisson', 'gamma', 'inverse-gaussian' class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): """Regression via a penalized Generalized Linear Model (GLM). 
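Side note: with HalfInverseGaussianLoss gone, the inverse Gaussian case is expressed as the Tweedie deviance at power=3, mirroring the earlier switch in the test parametrization from HalfInverseGaussianLoss() to TweedieRegressor(power=3.0). A rough usage sketch through the public estimator only (illustrative data; link="auto" resolves to the log link for power=3):

    import numpy as np
    from sklearn.linear_model import TweedieRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(100, 3))
    # strictly positive targets, as required for power >= 2
    y = np.exp(X @ np.array([0.3, -0.2, 0.1]))

    # power=3 is the inverse Gaussian deviance; link="auto" picks the log link here,
    # which is what the removed class hard-coded via HalfTweedieLoss(power=3)
    reg = TweedieRegressor(power=3, alpha=0.0, max_iter=300).fit(X, y)
    print(reg.coef_, reg.intercept_)

Nothing in this sketch depends on the removed private class; only the public power parameter is used.
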
From 16e8be5fd9d8a7b143b57cffecf32ccc248d4495 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 28 Mar 2022 07:59:46 +0200 Subject: [PATCH 41/43] DOC move private _base_loss to attributes section --- sklearn/linear_model/_glm/glm.py | 42 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index e60985fca4869..c174acbf7e31b 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -38,7 +38,7 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): 1/(2*sum(s_i)) * sum(s_i * deviance(y_i, h(x_i*w)) + 1/2 * alpha * ||w||_2^2 with inverse link function h, s=sample_weight and per observation (unit) deviance - deviance(y_i, h(x_i*w)). Note that for an EDM 1/2 * deviance is the negative + deviance(y_i, h(x_i*w)). Note that for an EDM, 1/2 * deviance is the negative log-likelihood up to a constant (in w) term. The parameter ``alpha`` corresponds to the lambda parameter in glmnet. @@ -64,26 +64,6 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - _base_loss : BaseLoss, default=HalfSquaredError() - A `_base_loss` contains a specific loss function as well as the link - function. The loss to be minimized specifies the distributional assumption of - the GLM, i.e. the distribution from the EDM. Here are some examples: - - ======================= ======== ========================== - _base_loss Link Target Domain - ======================= ======== ========================== - HalfSquaredError identity y any real number - HalfPoissonLoss log 0 <= y - HalfGammaLoss log 0 < y - HalfInverseGaussianLoss log 0 < y - HalfTweedieLoss log dependend on tweedie power - HalfTweedieLossIdentity identity dependend on tweedie power - ======================= ======== ========================== - - The link function of the GLM, i.e. mapping from linear predictor - `X @ coeff + intercept` to prediction `y_pred`. For instance, with a log link, - we have `y_pred = exp(X @ coeff + intercept)`. - solver : 'lbfgs', default='lbfgs' Algorithm to use in the optimization problem: @@ -120,6 +100,26 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): n_iter_ : int Actual number of iterations used in the solver. + + _base_loss : BaseLoss, default=HalfSquaredError() + This is set during fit via `self._get_loss()`. + A `_base_loss` contains a specific loss function as well as the link + function. The loss to be minimized specifies the distributional assumption of + the GLM, i.e. the distribution from the EDM. Here are some examples: + + ======================= ======== ========================== + _base_loss Link Target Domain + ======================= ======== ========================== + HalfSquaredError identity y any real number + HalfPoissonLoss log 0 <= y + HalfGammaLoss log 0 < y + HalfTweedieLoss log dependend on tweedie power + HalfTweedieLossIdentity identity dependend on tweedie power + ======================= ======== ========================== + + The link function of the GLM, i.e. mapping from linear predictor + `X @ coeff + intercept` to prediction `y_pred`. For instance, with a log link, + we have `y_pred = exp(X @ coeff + intercept)`. 
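Side note: the last sentence of the relocated docstring can be checked directly against the public API, since predict applies the inverse link to the linear predictor. A minimal sketch with PoissonRegressor, which always uses the log link (toy data, for illustration only):

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [2.0, 1.0]])
    y = np.array([1.0, 2.0, 3.0, 5.0])

    reg = PoissonRegressor(alpha=0.0).fit(X, y)
    # y_pred = exp(X @ coef_ + intercept_), i.e. h is the inverse of the log link
    print(np.allclose(np.exp(X @ reg.coef_ + reg.intercept_), reg.predict(X)))

The printed comparison holds by construction of the log link, independently of how well the fit converged.
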
""" def __init__( From 0024ca3a8f00be7da804a2f90d799dd10c35d8fc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 28 Mar 2022 08:00:48 +0200 Subject: [PATCH 42/43] CLN remove comment --- sklearn/linear_model/_glm/glm.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index c174acbf7e31b..d25301ef36ec9 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -820,13 +820,6 @@ def __init__( self.power = power def _get_loss(self): - # We could raise ValueError. But we want to be able to reset power - # gracefully. - # if self.power != self.base_loss.power: - # raise ValueError( - # "TweedieRegressor must have self.base_loss.power = self.power." - # ) - if self.link == "auto": if self.power <= 0: # identity link From 49fb93f99f3fe76d74f2faaa7ee3a8751e568d81 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 28 Mar 2022 08:05:07 +0200 Subject: [PATCH 43/43] CLN TODO(1.3) --- sklearn/_loss/glm_distribution.py | 2 +- sklearn/_loss/tests/test_glm_distribution.py | 2 +- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index d5f41c1d83f53..6fbe675fef533 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -5,7 +5,7 @@ # Author: Christian Lorentzen # License: BSD 3 clause # -# TODO: Remove with v1.3 +# TODO(1.3): remove file # This is only used for backward compatibility in _GeneralizedLinearRegressor # for the deprecated family attribute. diff --git a/sklearn/_loss/tests/test_glm_distribution.py b/sklearn/_loss/tests/test_glm_distribution.py index 3dd48f814db87..aaaa9de39a502 100644 --- a/sklearn/_loss/tests/test_glm_distribution.py +++ b/sklearn/_loss/tests/test_glm_distribution.py @@ -2,7 +2,7 @@ # # License: BSD 3 clause # -# TODO: Remove with v1.3 +# TODO(1.3): remove file import numpy as np from numpy.testing import ( assert_allclose, diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index d25301ef36ec9..68aa4ea0df22c 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -439,7 +439,7 @@ def _get_loss(self): """ return HalfSquaredError() - # FIXME: remove in v1.3 + # TODO(1.3): remove @deprecated( # type: ignore "Attribute `family` was deprecated in version 1.1 and will be removed in 1.3." ) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c1a1b5d28b0dc..9bfa2fe28e91a 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -455,7 +455,7 @@ def test_tags(estimator, value): assert estimator._get_tags()["requires_positive_y"] is value -# FIXME: remove in v1.3 +# TODO(1.3): remove @pytest.mark.parametrize( "est, family", [