MNT mean_tweedie_deviance refactoring into a sklearn._loss module #15245

Closed · wants to merge 5 commits
Empty file added sklearn/_loss/__init__.py
Empty file.
274 changes: 274 additions & 0 deletions sklearn/_loss/glm_distribution.py
@@ -0,0 +1,274 @@
"""
Distribution functions used in GLM
"""

# Author: Christian Lorentzen <lorentzen.ch@googlemail.com>
# License: BSD 3 clause

from abc import ABCMeta, abstractmethod
from collections import namedtuple
import numbers

import numpy as np
from scipy.special import xlogy


DistributionBoundary = namedtuple("DistributionBoundary",
("value", "inclusive"))


class ExponentialDispersionModel(metaclass=ABCMeta):
r"""Base class for reproductive Exponential Dispersion Models (EDM).

The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by

.. math:: p(y| \theta, \phi) = c(y, \phi)
\exp\left(\frac{\theta y-A(\theta)}{\phi}\right)
= \tilde{c}(y, \phi)
\exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right)

with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`,
variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`,
unit variance :math:`v(y_\textrm{pred})` and
unit deviance :math:`d(y,y_\textrm{pred})`.

Methods
-------
in_y_range
unit_deviance

References
----------
https://en.wikipedia.org/wiki/Exponential_dispersion_model.
"""

def in_y_range(self, y):
"""Returns ``True`` if y is in the valid range of Y~EDM.

Parameters
----------
y : array of shape (n_samples,)
Target values.
"""
# Note that currently supported distributions have +inf upper bound

if not isinstance(self._lower_bound, DistributionBoundary):
raise TypeError('_lower_bound attribute must be of type '
'DistributionBoundary')

if self._lower_bound.inclusive:
return np.greater_equal(y, self._lower_bound.value)
else:
return np.greater(y, self._lower_bound.value)

@abstractmethod
def unit_deviance(self, y, y_pred, check_input=False):
r"""Compute the unit deviance.

The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the
log-likelihood as
:math:`d(y,y_\textrm{pred}) = -2\phi\cdot
\left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).`

Parameters
----------
y : array of shape (n_samples,)
Target values.

y_pred : array of shape (n_samples,)
Predicted mean.

check_input : bool, default=False
            If True, raise an exception on invalid y or y_pred values;
            otherwise they will be propagated as NaN.

        Returns
-------
        deviance : array of shape (n_samples,)
            Computed deviance.
"""

def deviance(self, y, y_pred, weights=1):
r"""Compute the deviance.

The deviance is a weighted sum of the per sample unit deviances,
        :math:`D = \sum_i s_i \cdot d(y_i, y_{\textrm{pred},i})`
with weights :math:`s_i` and unit deviance
:math:`d(y,y_\textrm{pred})`.
In terms of the log-likelihood it is :math:`D = -2\phi\cdot
        \left(loglike(y,y_\textrm{pred},\frac{\phi}{s})
        - loglike(y,y,\frac{\phi}{s})\right)`.

Parameters
----------
y : array of shape (n_samples,)
Target values.

y_pred : array of shape (n_samples,)
Predicted mean.

weights : {int, array of shape (n_samples,)}, default=1
            Weights or exposure to which the variance is inversely
            proportional.
"""
return np.sum(weights * self.unit_deviance(y, y_pred))
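        # Illustrative check (comment only, not part of the original diff):
        # for NormalDistribution (power=0) the unit deviance is
        # (y - y_pred)**2, so with numpy arrays
        #   NormalDistribution().deviance(np.array([1., 2.]),
        #                                 np.array([1.5, 1.5]),
        #                                 weights=np.array([1., 2.]))
        # evaluates to 1 * (1 - 1.5)**2 + 2 * (2 - 1.5)**2 = 0.75.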


class TweedieDistribution(ExponentialDispersionModel):
r"""A class for the Tweedie distribution.

A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]`
    is uniquely defined by its mean-variance relationship
    :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^{power}`.

Special cases are:

    ===== ================
    Power Distribution
    ===== ================
    0     Normal
    1     Poisson
    (1,2) Compound Poisson
    2     Gamma
    3     Inverse Gaussian
    ===== ================

Parameters
----------
power : float, default=0
The variance power of the `unit_variance`
:math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`.
For ``0<power<1``, no distribution exists.
"""
def __init__(self, power=0):
self.power = power

@property
def power(self):
return self._power

@power.setter
def power(self, power):
# We use a property with a setter, to update lower and
# upper bound when the power parameter is updated e.g. in grid
# search.
if not isinstance(power, numbers.Real):
raise TypeError('power must be a real number, input was {0}'
.format(power))

if power <= 0:
# Extreme Stable or Normal distribution
self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False)
elif 0 < power < 1:
raise ValueError('Tweedie distribution is only defined for '
'power<=0 and power>=1.')
elif 1 <= power < 2:
# Poisson or Compound Poisson distribution
self._lower_bound = DistributionBoundary(0, inclusive=True)
elif power >= 2:
# Gamma, Positive Stable, Inverse Gaussian distributions
self._lower_bound = DistributionBoundary(0, inclusive=False)
else: # pragma: no cover
# this branch should be unreachable.
raise ValueError

self._power = power
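        # Illustrative examples of the resulting bounds (comment only,
        # not part of the original diff):
        #   TweedieDistribution(power=1.5)._lower_bound
        #   -> DistributionBoundary(value=0, inclusive=True)
        #   TweedieDistribution(power=3)._lower_bound
        #   -> DistributionBoundary(value=0, inclusive=False)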

def unit_deviance(self, y, y_pred, check_input=False):
r"""Compute the unit deviance.

The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the
log-likelihood as
:math:`d(y,y_\textrm{pred}) = -2\phi\cdot
\left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).`

Parameters
----------
y : array of shape (n_samples,)
Target values.

y_pred : array of shape (n_samples,)
Predicted mean.

check_input : bool, default=False
            If True, raise an exception on invalid y or y_pred values;
            otherwise they will be propagated as NaN.

        Returns
-------
        deviance : array of shape (n_samples,)
            Computed deviance.
"""
p = self.power

if check_input:
message = ("Mean Tweedie deviance error with power={} can only be "
"used on ".format(p))
if p < 0:
                # 'Extreme stable', y is any real number, y_pred > 0
if (y_pred <= 0).any():
raise ValueError(message + "strictly positive y_pred.")
elif p == 0:
# Normal, y and y_pred can be any real number
pass
elif 0 < p < 1:
raise ValueError("Tweedie deviance is only defined for "
"power<=0 and power>=1.")
elif 1 <= p < 2:
                # Poisson and Compound Poisson distribution, y >= 0, y_pred > 0
if (y < 0).any() or (y_pred <= 0).any():
raise ValueError(message + "non-negative y and strictly "
"positive y_pred.")
elif p >= 2:
# Gamma and Extreme stable distribution, y and y_pred > 0
if (y <= 0).any() or (y_pred <= 0).any():
raise ValueError(message
+ "strictly positive y and y_pred.")
            else:  # pragma: no cover
# Unreachable statement
raise ValueError

if p < 0:
            # 'Extreme stable', y is any real number, y_pred > 0
dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p))
- y * np.power(y_pred, 1-p) / (1-p)
+ np.power(y_pred, 2-p) / (2-p))

elif p == 0:
# Normal distribution, y and y_pred any real number
dev = (y - y_pred)**2
elif p < 1:
raise ValueError("Tweedie deviance is only defined for power<=0 "
"and power>=1.")
elif p == 1:
# Poisson distribution
dev = 2 * (xlogy(y, y/y_pred) - y + y_pred)
elif p == 2:
# Gamma distribution
dev = 2 * (np.log(y_pred/y) + y/y_pred - 1)
else:
dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p))
- y * np.power(y_pred, 1-p) / (1-p)
+ np.power(y_pred, 2-p) / (2-p))
return dev
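        # Numeric sanity check (comment only, not part of the original
        # diff): for the Poisson case (power=1),
        #   d(y=2, y_pred=1) = 2 * (2 * log(2) - 2 + 1) ~= 0.7726,
        # and d(y, y) = 0 for any valid y, as the tests below verify.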


class NormalDistribution(TweedieDistribution):
"""Class for the Normal (aka Gaussian) distribution"""
def __init__(self):
super().__init__(power=0)


class PoissonDistribution(TweedieDistribution):
"""Class for the scaled Poisson distribution"""
def __init__(self):
super().__init__(power=1)


class GammaDistribution(TweedieDistribution):
"""Class for the Gamma distribution"""
def __init__(self):
super().__init__(power=2)


EDM_DISTRIBUTIONS = {
'normal': NormalDistribution,
'poisson': PoissonDistribution,
'gamma': GammaDistribution,
}
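
A short usage sketch of the new private module (illustrative only; the
import path is internal and may change):

    import numpy as np
    from sklearn._loss.glm_distribution import (
        EDM_DISTRIBUTIONS, TweedieDistribution)

    y_true = np.array([0.5, 1.0, 2.0])
    y_pred = np.array([0.6, 0.9, 2.5])

    # Look up a distribution by name and compute its deviance.
    poisson = EDM_DISTRIBUTIONS['poisson']()
    print(poisson.deviance(y_true, y_pred))

    # Equivalent via the generic Tweedie parametrization (power=1).
    print(TweedieDistribution(power=1).deviance(y_true, y_pred))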
Empty file added sklearn/_loss/tests/__init__.py
Empty file.
72 changes: 72 additions & 0 deletions sklearn/_loss/tests/test_glm_distribution.py
@@ -0,0 +1,72 @@
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause
from numpy.testing import (
assert_allclose,
assert_array_equal,
)
import pytest

from sklearn._loss.glm_distribution import (
TweedieDistribution,
NormalDistribution, PoissonDistribution,
GammaDistribution, DistributionBoundary
)


@pytest.mark.parametrize(
'family, expected',
[(NormalDistribution(), [True, True, True]),
(PoissonDistribution(), [False, True, True]),
(TweedieDistribution(power=1.5), [False, True, True]),
(GammaDistribution(), [False, False, True]),
(TweedieDistribution(power=4.5), [False, False, True])])
def test_family_bounds(family, expected):
"""Test the valid range of distributions at -1, 0, 1."""
result = family.in_y_range([-1, 0, 1])
assert_array_equal(result, expected)


def test_invalid_distribution_bound():
dist = TweedieDistribution()
dist._lower_bound = 0
with pytest.raises(TypeError,
match="must be of type DistributionBoundary"):
dist.in_y_range([-1, 0, 1])


def test_tweedie_distribution_power():
msg = "distribution is only defined for power<=0 and power>=1"
with pytest.raises(ValueError, match=msg):
TweedieDistribution(power=0.5)

with pytest.raises(TypeError, match="must be a real number"):
TweedieDistribution(power=1j)

with pytest.raises(TypeError, match="must be a real number"):
dist = TweedieDistribution()
dist.power = 1j

dist = TweedieDistribution()
assert isinstance(dist._lower_bound, DistributionBoundary)

assert dist._lower_bound.inclusive is False
dist.power = 1
assert dist._lower_bound.value == 0.0
assert dist._lower_bound.inclusive is True


@pytest.mark.parametrize(
'family, chk_values',
[(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]),
(PoissonDistribution(), [0.1, 1.5]),
(GammaDistribution(), [0.1, 1.5]),
(TweedieDistribution(power=-2.5), [0.1, 1.5]),
(TweedieDistribution(power=-1), [0.1, 1.5]),
(TweedieDistribution(power=1.5), [0.1, 1.5]),
(TweedieDistribution(power=2.5), [0.1, 1.5]),
(TweedieDistribution(power=-4), [0.1, 1.5])])
def test_deviance_zero(family, chk_values):
"""Test deviance(y,y) = 0 for different families."""
for x in chk_values:
assert_allclose(family.deviance(x, x), 0, atol=1e-9)
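
A possible follow-up test (a sketch, not part of this PR): since this module
backs sklearn.metrics.mean_tweedie_deviance, the mean of the per-sample unit
deviances should match that metric, assuming the metric is the unweighted
average of the unit deviances:

    import numpy as np
    from sklearn.metrics import mean_tweedie_deviance

    @pytest.mark.parametrize('power', [-2.5, 0, 1, 1.5, 2, 3])
    def test_unit_deviance_matches_metric(power):
        # Strictly positive values are valid for every power tested.
        rng = np.random.RandomState(0)
        y_true = rng.uniform(0.1, 2.0, size=10)
        y_pred = rng.uniform(0.1, 2.0, size=10)
        dist = TweedieDistribution(power=power)
        assert_allclose(np.mean(dist.unit_deviance(y_true, y_pred)),
                        mean_tweedie_deviance(y_true, y_pred, power=power))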