From d5e88108a71c6c256b67948515f20955cc96fabf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 Jul 2017 21:50:10 +0200 Subject: [PATCH 001/209] [WIP] Add Generalized Linear Model, issue #5975, initial commit --- sklearn/linear_model/__init__.py | 7 + sklearn/linear_model/glm.py | 872 +++++++++++++++++++++++++ sklearn/linear_model/tests/test_glm.py | 73 +++ 3 files changed, 952 insertions(+) create mode 100644 sklearn/linear_model/glm.py create mode 100644 sklearn/linear_model/tests/test_glm.py diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 2e01990ccce8c..5acc51e9dc87f 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,6 +18,12 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from .glm import (Link, IdentityLink, LogLink, + ExponentialDispersionModel, TweedieDistribution, + NormalDistribution, GaussianDistribution, + PoissonDistribution, GammaDistribution, + InverseGaussianDistribution, GeneralizedHyperbolicSecand, + GeneralizedLinearModel) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -38,6 +44,7 @@ 'BayesianRidge', 'ElasticNet', 'ElasticNetCV', + 'GeneralizedLinearModel', 'Hinge', 'Huber', 'HuberRegressor', diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py new file mode 100644 index 0000000000000..8b6eb8f3bf16c --- /dev/null +++ b/sklearn/linear_model/glm.py @@ -0,0 +1,872 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +# TODO: Which name? GeneralizedLinearModel vs GeneralizedLinearRegression. +# So far, it is GeneralizedLinearModel, since it could very easily +# extended by Bernoulli/Binomial distribution. +# TODO: Which name/symbol for coefficients and weights in docu? +# sklearn.linear_models uses w for coefficients. +# So far, coefficients=beta and weight=w (as standard literature) +# TODO: Add l2-penalty +# TODO: Add l1-penalty (elastic net) +# TODO: Add cross validation +# TODO: Write docu and examples + +# Design Decisions: +# - The link funtion (instance of class Link) is necessary for the evaluation +# of deviance, score, Fisher and Hessian matrix as functions of the +# coefficients, which is needed by optimizers. +# Solution: link as argument in those functions + +from __future__ import division +from abc import ABCMeta, abstractmethod, abstractproperty +import numbers +import numpy as np +from scipy import linalg, optimize, sparse +import warnings +from .base import LinearModel, LinearRegression +from ..base import RegressorMixin +from ..utils import check_X_y +from ..utils.extmath import safe_sparse_dot +from ..utils.optimize import newton_cg +from ..utils.validation import check_is_fitted + + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link funtions + """ + + @abstractmethod + def link(self, mu): + """The link function g(mu) with argument mu=E[Y] returns the + linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def derivative(self, mu): + """Derivative of the link g'(mu). + """ + raise NotImplementedError + + @abstractmethod + def inverse(self, lin_pred): + """The inverse link function h(lin_pred) with the linear predictor as + argument returns mu=E[Y]. 
+ """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Derivative of the inverse link function h'(lin_pred). + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Second derivative of the inverse link function h''(lin_pred). + """ + raise NotImplementedError + +class IdentityLink(Link): + """The identity link function g(x)=x. + """ + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x). + """ + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + """Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _score + _fisher_matrix + _observed_information + _deviance + _deviance_derivative + _deviance_hessian + + References + ---------- + See https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + + @abstractproperty + def lower_bound(self): + """The lower bound of values of Y~EDM. + """ + raise NotImplementedError() + + @abstractproperty + def upper_bound(self): + """The upper bound of values of Y~EDM. + """ + raise NotImplementedError() + + @abstractmethod + def in_y_range(self, x): + """Returns true if x is in the valid range of Y~EDM. + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance(self, mu): + """The unit variance :math:`v(mu)` determines the variance as + a function of the mean mu by + :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance_derivative(self, mu): + """The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + """ + raise NotImplementedError() + + def variance(self, mu, phi=1, weight=1): + """The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is + :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, + with unit variance v(mu). + """ + return phi/weight * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weight=1): + """The derivative of the variance w.r.t. mu, + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/w_i*v'(\mu_i)`, with unit variance v(mu). 
+ """ + return phi/weight * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + """The unit_deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is given by + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + """ + raise NotImplementedError() + + def unit_deviance_derivative(self, y, mu): + """The derivative w.r.t. mu of the unit_deviance + :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Returns + ------- + derivative: array, shape = (n_samples,) + """ + return -2*(y-mu)/self.unit_variance(mu) + + def deviance(self, y, mu, weight=1): + """The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) + with weight :math:`w_i` and unit_deviance :math:`d(y,mu)`. + In terms of the likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{w}) + - loglike(y,y,\frac{phi}{w})\right).` + """ + return np.sum(weight*self.unit_deviance(y,mu)) + + def _deviance(self, coef, X, y, weight, link): + """The deviance as a function of the coefficients ``coef`` + (:math:`beta`). + """ + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + return self.deviance(y, mu, weight) + + def deviance_derivative(self, y, mu, weight=1): + """The derivative w.r.t. mu of the deviance.` + """ + return weight*self.unit_deviance_derivative(y,mu) + + def _score(self, coef, phi, X, y, weight, link): + """The score function :math:`s` is the derivative of the + log-likelihood w.r.t. the ``coef`` (:math:`\beta`). + It is given by + + .. math: + + \mathbf{s}(\boldsymbol{\beta}) = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}(y_1),\ldots)`. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + d = link.inverse_derivative(lin_pred) + d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) + score = safe_sparse_dot(X.T, temp, dense_output=False) + return score + + def _fisher_matrix(self, coef, phi, X, y, weight, link): + """The Fisher information matrix, also known as expected + information matrix. It is given by + + .. math: + + \mathbf{F}(\boldsymbol{\beta}) = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} + \partial\boldsymbol{\beta}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see score function. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + d2 = link.inverse_derivative(lin_pred)**2 + d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) + fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weight, link): + """The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood. It is given by + + .. 
math: + + \mathbf{H}(\boldsymbol{\beta}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} + \partial\boldsymbol{\beta}^T} + = \mathbf{X}^T \legt[ + - \mathbf{D}' \mathbf{R} + + \mathbf{D}^2 \mathbf{V} \mathbf{R} + + \mathbf{D}^2 + \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, + + with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, + :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ + v(\mu_i)} + \right)`, + see score function and Fisher matrix. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + dp = link.inverse_derivative2(lin_pred) + d2 = link.inverse_derivative(lin_pred)**2 + v = self.unit_variance_derivative(mu)/self.unit_variance(mu) + r = y - mu + temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(temp, X, dense_output=False) + observed_information = safe_sparse_dot(X.T, temp, dense_output=False) + return observed_information + + def _deviance_derivative(self, coef, X, y, weight, link): + """The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a + function of the coefficients ``coef``. + This is equivalent to :math:`-2\phi` times the score function + :math:`s` (derivative of the log-likelihood). + """ + score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, + link=link) + return -2*score + + def _deviance_hessian(self, coef, X, y, weight, link): + """The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance + as a function of the coefficients ``coef``. + This is equivalent to :math:`+2\phi` times the observed information + matrix. + """ + info_matrix = self._observed_information(coef=coef, phi=1, + X=X, y=y, weight=weight, link=link) + return 2*info_matrix + + def starting_mu(self, y, weight=1): + """Starting values for the mean mu_i in IRLS.""" + return (weight*y+np.mean(weight*y))/(2.*np.sum(np.ones_like(y)*weight)) + + +class TweedieDistribution(ExponentialDispersionModel): + """A class for the Tweedie distribution. + They have mu=E[X] and Var[X] \propto mu**power. + + Attributes + ---------- + power : float + The variance power of the unit_variance + :math:`v(mu) = mu^{power}`. 
+ """ + def __init__(self, power=0): + self.power = power + self._upper_bound = np.Inf + self._upper_compare = lambda x: np.less(x, self.upper_bound) + if power < 0: + #Extreme Stable + self._lower_bound = -np.Inf + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power == 0: + #GaussianDistribution + self._lower_bound = -np.Inf + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif (power > 0) and (power < 1): + raise ValueError('For 0 1) and (power < 2): + #Compound Poisson + self._lower_bound = 0 + self._lower_compare = ( + lambda x: np.greater_equal(x, self.lower_bound)) + elif power == 2: + #GammaDistribution + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif (power > 2) and (power < 3): + #Positive Stable + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power == 3: + #InverseGaussianDistribution + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power > 3: + #Positive Stable + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + self._power = power + + @property + def lower_bound(self): + return self._lower_bound + + @property + def upper_bound(self): + return self._upper_bound + + def in_y_range(self, x): + return np.logical_and(self._lower_compare(x), self._upper_compare(x)) + + def unit_variance(self, mu): + """The unit variance of a Tweedie distribution is v(mu)=mu**power. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """The derivative of the unit variance of a Tweedie distribution is + v(mu)=power*mu**(power-1). + """ + return self.power*np.power(mu, self.power-1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + #NormalDistribution + return (y-mu)**2 + if p == 1: + #PoissonDistribution + return 2 * (np.where(y==0,0,y*np.log(y/mu))-y+mu) + elif p == 2: + #GammaDistribution + return 2 * (np.log(mu/y)+y/mu-1) + else: + #return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y,0), 2-p)/((1-p)*(2-p)) + - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + def likelihood(self, y, X, beta, phi, weight=1): + raise NotImplementedError('This function is not (yet) implemented.') + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + +GaussianDistribution = NormalDistribution + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + +class GeneralizedHyperbolicSecand(ExponentialDispersionModel): + """A class for the von Generalized Hyperbolic Secand (GHS) distribution. 
+ + The GHS distribution is for data y in (-inf, inf). + """ + def __init__(self): + self._lower_bound = -np.Inf + self._upper_bound = np.Inf + + @property + def lower_bound(self): + return self._lower_bound + + @property + def upper_bound(self): + return self._upper_bound + + def in_y_range(self, x): + np.logical_and( + np.greater(x, self.lower_bound), + np.less(x, self.lower_bound) + ) + + def unit_variance(self, mu): + return 1 + mu**2 + + def unit_variance_derivative(self, mu): + return 2*mu + + def unit_deviance(self, y, mu): + return (2*y*(np.arctan(y) - np.arctan(mu)) + + np.log((1+mu**2)/(1+y**2))) + + + +class GeneralizedLinearModel(LinearModel, RegressorMixin): + """ + Class to fit a Generalized Linear Model (GLM) based on reproductive + Exponential Dispersion Models (EDM). + + Assumptions: + + - The target values y_i are realizations of random variables + :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion + parameter :math:`\phi` and weights :math:`w_i`. + - The expectation of :math:`Y_i` is :math:`mu_i=\mathrm{E}[Y]=h(\eta_i)` + whith the linear predictor :math:`\eta=X*\beta`, inverse link function + :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` + to be estimated. + + Note that the first assumption implies + :math:`\mathrm{Var}[Y_i]=\frac{\phi}{w_i} v(\mu_i)` with uni variance + function :math:`v(\mu)`. + + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments :math:`E[Y_i]=\mu_i=h(\eta_i)` and + :math:`Var[Y_i]=\frac{\phi}{w_i} v(\mu_i)` + + The parameters :math:`\beta` are estimated by maximum likelihood which is + equivalent to minimizing the deviance. + + TODO: Estimation of the dispersion parameter phi. + + TODO: Notes on 'scaled' Poisson and weights + + Parameters + ---------- + fit_intercept : boolean, optional, default True + whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (e.g. data is expected to be already centered). + + family : ExponentialDispersionModel, optional, default NormalDistribution() + the distributional assumption of the GLM + + link : Link, optional, default IdentityLink() + the link function (class) of the GLM + + fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' + method for estimation of the dispersion parameter phi. Whether to use + the chi squared statisic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'irls', 'newton-cg', 'lbfgs'}, defaul 'irls' + Algorithm to use in the optimization problem. + + - 'irls' is iterated reweighted least squares. It is the standard + algorithm for GLMs. + + - 'newton-cg', 'lbfgs' + + max_iter : int, default 100 + TODO + + tol : float + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative of + the deviance). + + start_params : {array shape (n_features, ), 'ols'}, default None + sets the start values for coef_ in the fit. + If None, default values are taken. + If 'ols' the result of an ordinary least squares in the link space + (linear predictor) is taken. + If an array is given, these values are taken as coef_ to start with. + If fit_intercept is true, the first value is assumed to be the start + value for the intercept_. + + verbose : int, default: 0 + For the lbfgs solver set verbose to any positive + number for verbosity. 
+ + Attributes + ---------- + coef_ : array, shape (1, n_features) + Estimated coefficients for the linear predictor (X*coef_) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\phi` if fit_dispersion is set. + + n_iter_ : int + Actual number of iterations of the solver. + + Notes + ----- + + References + ---------- + TODO + """ + + def __init__(self, fit_intercept=True, family=NormalDistribution(), + link=IdentityLink(), fit_dispersion='chisqr', solver='irls', max_iter=100, + tol=1e-4, start_params=None, verbose=0): + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = 100 + self.tol = tol + self.start_params = start_params + self.verbose = verbose + + def fit(self, X, y, weight=None): + """ + Fit a generalized linear model. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Training data + + y : numpy array of shape [n_samples] + Target values + + weight : numpy array of shape [n_samples] + Individual weights for each sample. + Var[Y_i]=phi/weight_i * v(mu) + If Y_i ~ EDM(mu, phi/w_i) then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)) + + Returns + ------- + self : returns an instance of self. + """ + if not isinstance(self.family, ExponentialDispersionModel): + raise ValueError("The argument family must be an instance of class" + "ExponentialDispersionModel.") + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool," + " got {0}".format(self.fit_intercept)) + if not self.solver in ['irls', 'lbfgs', 'newton-cg']: + raise ValueError("GLM Regression supports only irls, lbfgs and" + "newton-cg solvers, got {0}".format(self.solver)) + if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: + raise ValueError("Maximum number of iteration must be positive;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol < 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + start_params = self.start_params + if start_params is not None and start_params is not 'ols': + start_params = np.atleast_1d(start_params) + if start_params.shape[0] != X.shape[1] + self.fit_intercept: + raise ValueError("Start values for parameters must have the" + "right length; required length {0}, got {1}".format( + X.shape[1] + self.fit_intercept, start_params.shape[0])) + + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=False) + y = y.astype(np.float64) + + if not np.all(self.family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}".format(self.family.__class__.__name__)) + + if weight is None: + weight = np.ones_like(y) + elif np.isscalar(weight): + weight = weight*np.ones_like(y) + else: + weight = np.atleast_1d(weight) + if weight.ndim > 1: + raise ValueError("Weights must be 1D array or scalar") + elif weight.shape[0] != y.shape[0]: + raise ValueError("Weights must have the same length as y") + + + if self.fit_intercept: + #intercept is first column <=> coef[0] is for intecept + if sparse.issparse(X): + Xnew = sparse.hstack([np.ones([X.shape[0],1]), X]) + else: + Xnew = np.concatenate((np.ones((X.shape[0],1)), X), axis=1) + else: + Xnew = X + + n_samples, n_features = Xnew.shape + + #Note: Since phi does not enter 
the estimation of mu_i=E[y_i] + # set it to 1 where convenient. + + #set start values for coef + coef = None + if start_params is None: + #Use mu_start and apply one irls step to calculate coef + mu = self.family.starting_mu(y, weight) + #linear predictor + eta = self.link.link(mu) + #h'(eta) + hp = self.link.inverse_derivative(eta) + #working weights w, in principle a diagonal matrix + #therefore here just as 1d array + w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + wroot = np.sqrt(w) + #working observations + yw = eta + (y-mu)/hp + #least squares rescaled with wroot + wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) + X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) + yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) + coef = linalg.lstsq(X_rescale, yw_rescale)[0] + elif start_params is 'ols': + reg = LinearRegression(copy_X=False, + fit_intercept=False) + reg.fit(Xnew, self.link.link(y)) + coef = reg.coef_ + else: + coef = start_params + + #algorithms for optimiation + #TODO: Parallelize it + self.n_iter_ = 0 + converged = False + if self.solver == 'irls': + #linear predictor + eta = safe_sparse_dot(Xnew, coef, dense_output=True) + mu = self.link.inverse(eta) + while self.n_iter_ < self.max_iter: + self.n_iter_ += 1 + #coef_old not used so far. + #coef_old = coef + #h'(eta) + hp = self.link.inverse_derivative(eta) + #working weights w, in principle a diagonal matrix + #therefore here just as 1d array + w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + wroot = np.sqrt(w) + #working observations + yw = eta + (y-mu)/hp + #least squares rescaled with wroot + wroot = sparse.dia_matrix((wroot, 0), + shape=(n_samples, n_samples)) + X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) + yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) + coef, residues, rank, singular_ = ( + linalg.lstsq(X_rescale, yw_rescale)) + + #updated linear predictor + #do it here for updated values for tolerance + eta = safe_sparse_dot(Xnew, coef, dense_output=True) + mu = self.link.inverse(eta) + + #which tolerace? |coef - coef_old| or gradient? + #use gradient for compliance with newton-cg and lbfgs + #TODO: faster computation of gradient, use mu and eta directly + gradient = self.family._deviance_derivative(coef=coef, + X=Xnew, y=y, weight=weight, link=self.link) + if (np.max(np.abs(gradient)) <= self.tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})".format(self.max_iter)) + + #TODO: performance: make one function return both deviance and gradient + elif self.solver == 'lbfgs': + func = self.family._deviance + fprime = self.family._deviance_derivative + args = (Xnew, y, weight, self.link) + coef, loss, info = optimize.fmin_l_bfgs_b( + func, coef, fprime=fprime, + args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." 
+ " Increase the number of iterations.") + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}".format( + info["task"])) + self.n_iter_ = info['nit'] + elif self.solver == 'newton-cg': + func = self.family._deviance + grad = self.family._deviance_derivative + def grad_hess(coef, X, y, weight, link): + grad = (self.family + ._deviance_derivative(coef, X, y, weight, link)) + hessian = (self.family + ._deviance_hessian(coef, X, y, weight,link)) + def Hs(s): + ret = np.dot(hessian, s) + return ret + return grad, Hs + hess = grad_hess + args = (Xnew, y, weight, self.link) + coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, + maxiter=self.max_iter, tol=self.tol) + self.coef_ = coef + + if self.fit_intercept is True: + self.intercept_ = coef[0] + self.coef_ = coef[1:] + else: + self.coef_ = coef + + if self.fit_dispersion in ['chisqr', 'deviance']: + self.dispersion_ = self.estimate_phi(y, X, weight) + + return self + + def predict(self, X, weight=1): + check_is_fitted(self, "coef_") + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + return mu*weight + + def estimate_phi(self, y, X, weight): + n_samples, n_features = X.shape + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weight*(y-mu)**2/self.family.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self.family.deviance(y, mu, weight) + return dev/(n_samples - n_features) + + def score(self, X, y, weight=1): + """The natural score for a GLM is -deviance. + Returns the weight averaged negitive deviance (the better the score, + the better the fit). Maximum score is therefore 0. + """ + #RegressorMixin has R^2 score. 
+ #TODO: Make it more compatible with the score function in + # sklearn.metrics.regression.py + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + output_errors = self.family.unit_deviance(y,mu) + weight = weight * np.ones_like(y) + return np.average(output_errors, weights=weight) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..2a88a7ec899d9 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,73 @@ +import numpy as np + +from sklearn.linear_model.glm import (Link, IdentityLink, LogLink, + ExponentialDispersionModel, TweedieDistribution, + NormalDistribution, GaussianDistribution, + PoissonDistribution, GammaDistribution, + InverseGaussianDistribution, GeneralizedHyperbolicSecand, + GeneralizedLinearModel) + +from sklearn.utils.testing import (assert_equal, assert_array_equal, + assert_array_almost_equal) + +def test_family_bounds(): + """Test the valid range of distributions + """ + family = NormalDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [True, True, True]) + + family = PoissonDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, True, True]) + + family = TweedieDistribution(power=1.5) + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, True, True]) + + family = GammaDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + + family = InverseGaussianDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + + family = TweedieDistribution(power=4.5) + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + +def test_glm_identiy_regression(): + """Test linear regression on a simple dataset + """ + coef = [1,2] + X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + y = np.dot(X, coef) + for solver in ['irls', 'lbfgs', 'newton-cg']: + for family in (GaussianDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)): + glm = GeneralizedLinearModel(family=family, + fit_intercept=False, solver=solver) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + +def test_glm_log_regression(): + """Test linear regression on a simple dataset + """ + coef = [1,2] + X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + y = np.exp(np.dot(X, coef)) + #for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls']: + #for family in [GaussianDistribution(), PoissonDistribution(), + # GammaDistribution(), InverseGaussianDistribution(), + # TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]: + for family in [GaussianDistribution()]: + glm = GeneralizedLinearModel(family=family, + link=LogLink(), + fit_intercept=False, solver=solver, start_params='ols') + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + +#TODO: Test compatibility with R's glm, glmnet From 2fc189d8351c9710c1329750545539afe3e6e40c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 19 Jul 2017 17:33:04 +0200 Subject: [PATCH 002/209] [WIP] Add Generalized Linear Models (#9405) * Fixed pep8 * Fixed flake8 * Rename GeneralizedLinearModel as GeneralizedLinearRegressor * Use of six.with_metaclass * PEP257: summary should be on same line as quotes * Docstring of class 
GeneralizedLinearRegressor: \ before mu * Arguments family and link accept strings * Use of ConvergenceWarning --- sklearn/linear_model/__init__.py | 13 +- sklearn/linear_model/glm.py | 231 ++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 75 ++++---- 3 files changed, 180 insertions(+), 139 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 5acc51e9dc87f..0c5840f343a3a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,12 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from .glm import (Link, IdentityLink, LogLink, - ExponentialDispersionModel, TweedieDistribution, - NormalDistribution, GaussianDistribution, - PoissonDistribution, GammaDistribution, - InverseGaussianDistribution, GeneralizedHyperbolicSecand, - GeneralizedLinearModel) +from .glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -44,7 +40,6 @@ 'BayesianRidge', 'ElasticNet', 'ElasticNetCV', - 'GeneralizedLinearModel', 'Hinge', 'Huber', 'HuberRegressor', @@ -84,4 +79,6 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution'] diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 8b6eb8f3bf16c..cf91a64fafc12 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -5,9 +5,6 @@ # Author: Christian Lorentzen # License: BSD 3 clause -# TODO: Which name? GeneralizedLinearModel vs GeneralizedLinearRegression. -# So far, it is GeneralizedLinearModel, since it could very easily -# extended by Bernoulli/Binomial distribution. # TODO: Which name/symbol for coefficients and weights in docu? # sklearn.linear_models uses w for coefficients. # So far, coefficients=beta and weight=w (as standard literature) @@ -17,6 +14,10 @@ # TODO: Write docu and examples # Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# So far, it is GeneralizedLinearModel, since it could very easily +# extended by Bernoulli/Binomial distribution. +# Solution: GeneralizedLinearRegressor # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. @@ -28,16 +29,17 @@ import numpy as np from scipy import linalg, optimize, sparse import warnings -from .base import LinearModel, LinearRegression -from ..base import RegressorMixin +from .base import LinearRegression +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..externals import six from ..utils import check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted - -class Link(metaclass=ABCMeta): +class Link(six.with_metaclass(ABCMeta)): """Abstract base class for Link funtions """ @@ -73,6 +75,7 @@ def inverse_derivative2(self, lin_pred): """ raise NotImplementedError + class IdentityLink(Link): """The identity link function g(x)=x. 
""" @@ -113,7 +116,7 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) -class ExponentialDispersionModel(metaclass=ABCMeta): +class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): """Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -235,7 +238,7 @@ def deviance(self, y, mu, weight=1): \left(loglike(y,\mu,\frac{phi}{w}) - loglike(y,y,\frac{phi}{w})\right).` """ - return np.sum(weight*self.unit_deviance(y,mu)) + return np.sum(weight*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weight, link): """The deviance as a function of the coefficients ``coef`` @@ -248,7 +251,7 @@ def _deviance(self, coef, X, y, weight, link): def deviance_derivative(self, y, mu, weight=1): """The derivative w.r.t. mu of the deviance.` """ - return weight*self.unit_deviance_derivative(y,mu) + return weight*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weight, link): """The score function :math:`s` is the derivative of the @@ -269,7 +272,7 @@ def _score(self, coef, phi, X, y, weight, link): sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) score = safe_sparse_dot(X.T, temp, dense_output=False) return score @@ -294,7 +297,7 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) return fisher_matrix @@ -329,7 +332,7 @@ def _observed_information(self, coef, phi, X, y, weight, link): v = self.unit_variance_derivative(mu)/self.unit_variance(mu) r = y - mu temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(temp, X, dense_output=False) observed_information = safe_sparse_dot(X.T, temp, dense_output=False) return observed_information @@ -341,7 +344,7 @@ def _deviance_derivative(self, coef, X, y, weight, link): :math:`s` (derivative of the log-likelihood). """ score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, - link=link) + link=link) return -2*score def _deviance_hessian(self, coef, X, y, weight, link): @@ -350,8 +353,8 @@ def _deviance_hessian(self, coef, X, y, weight, link): This is equivalent to :math:`+2\phi` times the observed information matrix. 
""" - info_matrix = self._observed_information(coef=coef, phi=1, - X=X, y=y, weight=weight, link=link) + info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, + weight=weight, link=link) return 2*info_matrix def starting_mu(self, y, weight=1): @@ -374,39 +377,39 @@ def __init__(self, power=0): self._upper_bound = np.Inf self._upper_compare = lambda x: np.less(x, self.upper_bound) if power < 0: - #Extreme Stable + # Extreme Stable self._lower_bound = -np.Inf self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power == 0: - #GaussianDistribution + # GaussianDistribution self._lower_bound = -np.Inf self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): - #Compound Poisson + # Compound Poisson self._lower_bound = 0 self._lower_compare = ( lambda x: np.greater_equal(x, self.lower_bound)) elif power == 2: - #GammaDistribution + # GammaDistribution self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif (power > 2) and (power < 3): - #Positive Stable + # Positive Stable self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power == 3: - #InverseGaussianDistribution + # InverseGaussianDistribution self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power > 3: - #Positive Stable + # Positive Stable self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) @@ -418,7 +421,7 @@ def power(self): def power(self, power): if not isinstance(power, numbers.Real): raise TypeError('power must be a real number, input was {0}' - .format(power)) + .format(power)) self._power = power @property @@ -446,19 +449,19 @@ def unit_variance_derivative(self, mu): def unit_deviance(self, y, mu): p = self.power if p == 0: - #NormalDistribution + # NormalDistribution return (y-mu)**2 if p == 1: - #PoissonDistribution - return 2 * (np.where(y==0,0,y*np.log(y/mu))-y+mu) + # PoissonDistribution + return 2 * (np.where(y == 0, 0, y*np.log(y/mu))-y+mu) elif p == 2: - #GammaDistribution + # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) else: - #return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y,0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) def likelihood(self, y, X, beta, phi, weight=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -469,23 +472,25 @@ class NormalDistribution(TweedieDistribution): def __init__(self): super(NormalDistribution, self).__init__(power=0) -GaussianDistribution = NormalDistribution class PoissonDistribution(TweedieDistribution): """Class for the scaled Poisson distribution""" def __init__(self): super(PoissonDistribution, self).__init__(power=1) + class GammaDistribution(TweedieDistribution): """Class for the Gamma distribution""" def __init__(self): super(GammaDistribution, self).__init__(power=2) + class InverseGaussianDistribution(TweedieDistribution): """Class for the scaled InverseGaussianDistribution distribution""" def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) + class GeneralizedHyperbolicSecand(ExponentialDispersionModel): """A class for the von Generalized Hyperbolic Secand (GHS) distribution. 
@@ -516,12 +521,11 @@ def unit_variance_derivative(self, mu): return 2*mu def unit_deviance(self, y, mu): - return (2*y*(np.arctan(y) - np.arctan(mu)) - + np.log((1+mu**2)/(1+y**2))) - + return (2*y*(np.arctan(y) - np.arctan(mu)) + + np.log((1+mu**2)/(1+y**2))) -class GeneralizedLinearModel(LinearModel, RegressorMixin): +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). @@ -531,7 +535,7 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): - The target values y_i are realizations of random variables :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion parameter :math:`\phi` and weights :math:`w_i`. - - The expectation of :math:`Y_i` is :math:`mu_i=\mathrm{E}[Y]=h(\eta_i)` + - The expectation of :math:`Y_i` is :math:`\mu_i=\mathrm{E}[Y]=h(\eta_i)` whith the linear predictor :math:`\eta=X*\beta`, inverse link function :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` to be estimated. @@ -549,7 +553,9 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): TODO: Estimation of the dispersion parameter phi. - TODO: Notes on 'scaled' Poisson and weights + TODO: Notes on weights and 'scaled' Poisson, e.g. fit y = x/w with + with x=counts and w=exposure (time, money, persons, ...) => y is a + ratio with weights w. Parameters ---------- @@ -558,10 +564,12 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): to False, no intercept will be used in calculations (e.g. data is expected to be already centered). - family : ExponentialDispersionModel, optional, default NormalDistribution() - the distributional assumption of the GLM + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance + of a subclass of ExponentialDispersionModel, optional, default 'normal' + the distributional assumption of the GLM. - link : Link, optional, default IdentityLink() + link : {'identity', 'log'} or an instance of a subclass of Link, + optional, default IdentityLink() the link function (class) of the GLM fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' @@ -622,8 +630,8 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): """ def __init__(self, fit_intercept=True, family=NormalDistribution(), - link=IdentityLink(), fit_dispersion='chisqr', solver='irls', max_iter=100, - tol=1e-4, start_params=None, verbose=0): + link=IdentityLink(), fit_dispersion='chisqr', solver='irls', + max_iter=100, tol=1e-4, start_params=None, verbose=0): self.fit_intercept = fit_intercept self.family = family self.link = link @@ -635,8 +643,7 @@ def __init__(self, fit_intercept=True, family=NormalDistribution(), self.verbose = verbose def fit(self, X, y, weight=None): - """ - Fit a generalized linear model. + """Fit a generalized linear model. Parameters ---------- @@ -657,12 +664,32 @@ def fit(self, X, y, weight=None): self : returns an instance of self. 
""" if not isinstance(self.family, ExponentialDispersionModel): - raise ValueError("The argument family must be an instance of class" - "ExponentialDispersionModel.") + if self.family == 'normal': + self.family = NormalDistribution() + elif self.family == 'poisson': + self.family = PoissonDistribution() + elif self.family == 'gamma': + self.family = GammaDistribution() + elif self.family == 'inverse.gaussian': + self.family = InverseGaussianDistribution() + else: + raise ValueError( + "The argument family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") + if not isinstance(self.link, Link): + if self.link == 'identity': + self.link = IdentityLink() + if self.link == 'log': + self.link = LogLink() + else: + raise ValueError( + "The argument link must be an instance of class Link or" + " an element of ['identity', 'log'].") if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool," " got {0}".format(self.fit_intercept)) - if not self.solver in ['irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['irls', 'lbfgs', 'newton-cg']: raise ValueError("GLM Regression supports only irls, lbfgs and" "newton-cg solvers, got {0}".format(self.solver)) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: @@ -676,8 +703,9 @@ def fit(self, X, y, weight=None): start_params = np.atleast_1d(start_params) if start_params.shape[0] != X.shape[1] + self.fit_intercept: raise ValueError("Start values for parameters must have the" - "right length; required length {0}, got {1}".format( - X.shape[1] + self.fit_intercept, start_params.shape[0])) + "right length; required length {0}, got {1}" + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0])) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], y_numeric=True, multi_output=False) @@ -685,7 +713,8 @@ def fit(self, X, y, weight=None): if not np.all(self.family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}".format(self.family.__class__.__name__)) + "range for family {0}" + .format(self.family.__class__.__name__)) if weight is None: weight = np.ones_like(y) @@ -698,96 +727,96 @@ def fit(self, X, y, weight=None): elif weight.shape[0] != y.shape[0]: raise ValueError("Weights must have the same length as y") - if self.fit_intercept: - #intercept is first column <=> coef[0] is for intecept + # intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0],1]), X]) + Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: - Xnew = np.concatenate((np.ones((X.shape[0],1)), X), axis=1) + Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) else: Xnew = X n_samples, n_features = Xnew.shape - #Note: Since phi does not enter the estimation of mu_i=E[y_i] - # set it to 1 where convenient. + # Note: Since dispersion_ alias phi does not enter the estimation + # of mu_i=E[y_i] set it to 1 where convenient. 
- #set start values for coef + # set start values for coef coef = None if start_params is None: - #Use mu_start and apply one irls step to calculate coef + # Use mu_start and apply one irls step to calculate coef mu = self.family.starting_mu(y, weight) - #linear predictor + # linear predictor eta = self.link.link(mu) - #h'(eta) + # h'(eta) hp = self.link.inverse_derivative(eta) - #working weights w, in principle a diagonal matrix - #therefore here just as 1d array + # working weights w, in principle a diagonal matrix + # therefore here just as 1d array w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) - #working observations + # working observations yw = eta + (y-mu)/hp - #least squares rescaled with wroot + # least squares rescaled with wroot wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) coef = linalg.lstsq(X_rescale, yw_rescale)[0] elif start_params is 'ols': - reg = LinearRegression(copy_X=False, - fit_intercept=False) + reg = LinearRegression(copy_X=False, fit_intercept=False) reg.fit(Xnew, self.link.link(y)) coef = reg.coef_ else: coef = start_params - #algorithms for optimiation - #TODO: Parallelize it + # algorithms for optimiation + # TODO: Parallelize it self.n_iter_ = 0 converged = False if self.solver == 'irls': - #linear predictor + # linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = self.link.inverse(eta) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 - #coef_old not used so far. - #coef_old = coef - #h'(eta) + # coef_old not used so far. + # coef_old = coef + # h'(eta) hp = self.link.inverse_derivative(eta) - #working weights w, in principle a diagonal matrix - #therefore here just as 1d array + # working weights w, in principle a diagonal matrix + # therefore here just as 1d array w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) - #working observations + # working observations yw = eta + (y-mu)/hp - #least squares rescaled with wroot + # least squares rescaled with wroot wroot = sparse.dia_matrix((wroot, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) - coef, residues, rank, singular_ = ( + coef, residues, rank, singular_ = ( linalg.lstsq(X_rescale, yw_rescale)) - #updated linear predictor - #do it here for updated values for tolerance + # updated linear predictor + # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = self.link.inverse(eta) - #which tolerace? |coef - coef_old| or gradient? - #use gradient for compliance with newton-cg and lbfgs - #TODO: faster computation of gradient, use mu and eta directly - gradient = self.family._deviance_derivative(coef=coef, - X=Xnew, y=y, weight=weight, link=self.link) + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # TODO: faster computation of gradient, use mu and eta directly + gradient = self.family._deviance_derivative( + coef=coef, X=Xnew, y=y, weight=weight, link=self.link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break if not converged: warnings.warn("irls failed to converge. 
Increase the number " - "of iterations (currently {0})".format(self.max_iter)) + "of iterations (currently {0})" + .format(self.max_iter), ConvergenceWarning) - #TODO: performance: make one function return both deviance and gradient + # TODO: performance: make one function return both deviance and + # gradient of deviance elif self.solver == 'lbfgs': func = self.family._deviance fprime = self.family._deviance_derivative @@ -800,7 +829,8 @@ def fit(self, X, y, weight=None): if self.verbose > 0: if info["warnflag"] == 1: warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.") + " Increase the number of iterations.", + ConvergenceWarning) elif info["warnflag"] == 2: warnings.warn("lbfgs failed for the reason: {0}".format( info["task"])) @@ -808,11 +838,13 @@ def fit(self, X, y, weight=None): elif self.solver == 'newton-cg': func = self.family._deviance grad = self.family._deviance_derivative + def grad_hess(coef, X, y, weight, link): - grad = (self.family - ._deviance_derivative(coef, X, y, weight, link)) - hessian = (self.family - ._deviance_hessian(coef, X, y, weight,link)) + grad = (self.family._deviance_derivative( + coef, X, y, weight, link)) + hessian = (self.family._deviance_hessian( + coef, X, y, weight, link)) + def Hs(s): ret = np.dot(hessian, s) return ret @@ -820,7 +852,7 @@ def Hs(s): hess = grad_hess args = (Xnew, y, weight, self.link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, - maxiter=self.max_iter, tol=self.tol) + maxiter=self.max_iter, tol=self.tol) self.coef_ = coef if self.fit_intercept is True: @@ -835,6 +867,9 @@ def Hs(s): return self def predict(self, X, weight=1): + """Prediction with features X. + If weights are given, returns prediction*weights. + """ check_is_fitted(self, "coef_") eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: @@ -860,13 +895,13 @@ def score(self, X, y, weight=1): Returns the weight averaged negitive deviance (the better the score, the better the fit). Maximum score is therefore 0. """ - #RegressorMixin has R^2 score. - #TODO: Make it more compatible with the score function in + # RegressorMixin has R^2 score. 
+ # TODO: Make it more compatible with the score function in # sklearn.metrics.regression.py eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ mu = self.link.inverse(eta) - output_errors = self.family.unit_deviance(y,mu) + output_errors = self.family.unit_deviance(y, mu) weight = weight * np.ones_like(y) return np.average(output_errors, weights=weight) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 2a88a7ec899d9..a4d4ea8650860 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,73 +1,82 @@ import numpy as np -from sklearn.linear_model.glm import (Link, IdentityLink, LogLink, - ExponentialDispersionModel, TweedieDistribution, - NormalDistribution, GaussianDistribution, - PoissonDistribution, GammaDistribution, - InverseGaussianDistribution, GeneralizedHyperbolicSecand, - GeneralizedLinearModel) +from sklearn.linear_model.glm import ( + # Link, IdentityLink, + LogLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + # GeneralizedHyperbolicSecand, + GeneralizedLinearRegressor) + +from sklearn.utils.testing import ( + # assert_equal, + assert_array_equal, assert_array_almost_equal) -from sklearn.utils.testing import (assert_equal, assert_array_equal, - assert_array_almost_equal) def test_family_bounds(): """Test the valid range of distributions """ family = NormalDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [True, True, True]) family = PoissonDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, True, True]) family = TweedieDistribution(power=1.5) - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, True, True]) family = GammaDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) family = InverseGaussianDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) family = TweedieDistribution(power=4.5) - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) + def test_glm_identiy_regression(): """Test linear regression on a simple dataset """ - coef = [1,2] - X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + coef = [1, 2] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + families = ( + NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) for solver in ['irls', 'lbfgs', 'newton-cg']: - for family in (GaussianDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)): - glm = GeneralizedLinearModel(family=family, - fit_intercept=False, solver=solver) + for family in families: + glm = GeneralizedLinearRegressor( + family=family, fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) + def test_glm_log_regression(): """Test linear regression on a simple dataset """ - coef = [1,2] - X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + coef = [1, 2] + X = 
np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) - #for solver in ['irls', 'lbfgs', 'newton-cg']: - for solver in ['irls']: - #for family in [GaussianDistribution(), PoissonDistribution(), - # GammaDistribution(), InverseGaussianDistribution(), - # TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]: - for family in [GaussianDistribution()]: - glm = GeneralizedLinearModel(family=family, - link=LogLink(), - fit_intercept=False, solver=solver, start_params='ols') + families = ( + NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + for solver in ['irls', 'lbfgs', 'newton-cg']: + for family in families: + glm = GeneralizedLinearRegressor( + family=family, link=LogLink(), fit_intercept=False, + solver=solver, start_params='ols') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) -#TODO: Test compatibility with R's glm, glmnet + +# TODO: Test compatibility with R's glm, glmnet From a6137d85401ef72976327c211f44d721d9f81e00 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 19 Jul 2017 17:41:49 +0200 Subject: [PATCH 003/209] [WIP] Add Generalized Linear Models (#9405) * GeneralizedLinearRegressor added to doc/modules/classes.rst --- doc/modules/classes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index e09ca0422d8a7..2d451b6758eb1 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -720,6 +720,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV From b0be167080588a35dc1f4b762d961edb897b1019 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 9 Aug 2017 13:38:49 +0200 Subject: [PATCH 004/209] [WIP] Add Generalized Linear Models (#9405) * fixed bug: init parameter max_iter * fix API for family and link: default parameter changed to string non public variables self._family_instance and self._link_instance * fixed bug in score, minus sign forgotten * added check_is_fitted to estimate_phi and score * added check_array(X) in predict * replaced lambda functions in TweedieDistribution * some documentation --- sklearn/linear_model/glm.py | 196 ++++++++++++++++++++++-------------- 1 file changed, 122 insertions(+), 74 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index cf91a64fafc12..0ee1564049329 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -15,9 +15,10 @@ # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. -# So far, it is GeneralizedLinearModel, since it could very easily -# extended by Bernoulli/Binomial distribution. -# Solution: GeneralizedLinearRegressor +# Estimators in sklearn are either regressors or classifiers. A Generalized +# Linear Model does both depending on the chosen distribution, e.g. Normal => +# regressor, Bernoulli/Binomial => classifier. +# Solution: GeneralizedLinearRegressor since this is the focus. # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. 
@@ -33,7 +34,7 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six -from ..utils import check_X_y +from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted @@ -372,46 +373,67 @@ class TweedieDistribution(ExponentialDispersionModel): The variance power of the unit_variance :math:`v(mu) = mu^{power}`. """ + def _less_upper_bound(self, x): + return np.less(x, self.upper_bound) + + def _less_equal_upper_bound(self, x): + return np.less_equal(x, self.upper_bound) + + def _greater_lower_bound(self, x): + return np.greater(x, self.lower_bound) + + def _greater_equal_lower_bound(self, x): + return np.greater_equal(x, self.lower_bound) + def __init__(self, power=0): self.power = power self._upper_bound = np.Inf - self._upper_compare = lambda x: np.less(x, self.upper_bound) + # self._upper_compare = lambda x: np.less(x, self.upper_bound) + self._upper_compare = self._less_upper_bound if power < 0: # Extreme Stable self._lower_bound = -np.Inf - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power == 0: - # GaussianDistribution + # NormalDistribution self._lower_bound = -np.Inf - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): # Compound Poisson self._lower_bound = 0 - self._lower_compare = ( - lambda x: np.greater_equal(x, self.lower_bound)) + # self._lower_compare = ( + # lambda x: np.greater_equal(x, self.lower_bound)) + self._lower_compare = self._greater_equal_lower_bound elif power == 2: # GammaDistribution self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif (power > 2) and (power < 3): # Positive Stable self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power == 3: # InverseGaussianDistribution self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power > 3: # Positive Stable self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound @property def power(self): @@ -530,6 +552,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). + #TODO: This belongs to User Guide Assumptions: - The target values y_i are realizations of random variables @@ -559,25 +582,26 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- - fit_intercept : boolean, optional, default True - whether to calculate the intercept for this model. If set - to False, no intercept will be used in calculations - (e.g. 
data is expected to be already centered). + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance - of a subclass of ExponentialDispersionModel, optional, default 'normal' + of a subclass of ExponentialDispersionModel, optional + (default='normal') the distributional assumption of the GLM. link : {'identity', 'log'} or an instance of a subclass of Link, - optional, default IdentityLink() - the link function (class) of the GLM + optional (default='identity') + the link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). - fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul='chisqr') method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'newton-cg', 'lbfgs'}, defaul 'irls' + solver : {'irls', 'newton-cg', 'lbfgs'}, optional (defaul='irls') Algorithm to use in the optimization problem. - 'irls' is iterated reweighted least squares. It is the standard @@ -585,16 +609,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'newton-cg', 'lbfgs' - max_iter : int, default 100 + max_iter : int, optional (default=100) TODO - tol : float + tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative of the deviance). - start_params : {array shape (n_features, ), 'ols'}, default None + start_params : {array shape (n_features, ), 'ols'}, optional (default=None) sets the start values for coef_ in the fit. If None, default values are taken. If 'ols' the result of an ordinary least squares in the link space @@ -603,9 +627,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If fit_intercept is true, the first value is assumed to be the start value for the intercept_. - verbose : int, default: 0 - For the lbfgs solver set verbose to any positive - number for verbosity. + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. Attributes ---------- @@ -629,15 +652,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO """ - def __init__(self, fit_intercept=True, family=NormalDistribution(), - link=IdentityLink(), fit_dispersion='chisqr', solver='irls', + def __init__(self, fit_intercept=True, family='normal', + link='identity', fit_dispersion='chisqr', solver='irls', max_iter=100, tol=1e-4, start_params=None, verbose=0): self.fit_intercept = fit_intercept self.family = family self.link = link self.fit_dispersion = fit_dispersion self.solver = solver - self.max_iter = 100 + self.max_iter = max_iter self.tol = tol self.start_params = start_params self.verbose = verbose @@ -663,29 +686,38 @@ def fit(self, X, y, weight=None): ------- self : returns an instance of self. 
""" - if not isinstance(self.family, ExponentialDispersionModel): + # Garantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: if self.family == 'normal': - self.family = NormalDistribution() + self._family_instance = NormalDistribution() elif self.family == 'poisson': - self.family = PoissonDistribution() + self._family_instance = PoissonDistribution() elif self.family == 'gamma': - self.family = GammaDistribution() + self._family_instance = GammaDistribution() elif self.family == 'inverse.gaussian': - self.family = InverseGaussianDistribution() + self._family_instance = InverseGaussianDistribution() else: raise ValueError( - "The argument family must be an instance of class" + "The family must be an instance of class" " ExponentialDispersionModel or an element of" " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") - if not isinstance(self.link, Link): + + # Garantee that self._link_instance is set to an instance of class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: if self.link == 'identity': - self.link = IdentityLink() - if self.link == 'log': - self.link = LogLink() + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() else: raise ValueError( - "The argument link must be an instance of class Link or" + "The link must be an instance of class Link or" " an element of ['identity', 'log'].") + if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool," " got {0}".format(self.fit_intercept)) @@ -711,10 +743,13 @@ def fit(self, X, y, weight=None): y_numeric=True, multi_output=False) y = y.astype(np.float64) - if not np.all(self.family.in_y_range(y)): + family = self._family_instance + link = self._link_instance + + if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" - .format(self.family.__class__.__name__)) + .format(family.__class__.__name__)) if weight is None: weight = np.ones_like(y) @@ -745,14 +780,14 @@ def fit(self, X, y, weight=None): coef = None if start_params is None: # Use mu_start and apply one irls step to calculate coef - mu = self.family.starting_mu(y, weight) + mu = family.starting_mu(y, weight) # linear predictor - eta = self.link.link(mu) + eta = link.link(mu) # h'(eta) - hp = self.link.inverse_derivative(eta) + hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -763,7 +798,7 @@ def fit(self, X, y, weight=None): coef = linalg.lstsq(X_rescale, yw_rescale)[0] elif start_params is 'ols': reg = LinearRegression(copy_X=False, fit_intercept=False) - reg.fit(Xnew, self.link.link(y)) + reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: coef = start_params @@ -775,16 +810,16 @@ def fit(self, X, y, weight=None): if self.solver == 'irls': # linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = self.link.inverse(eta) + mu = link.inverse(eta) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 # coef_old not used so far. 
# coef_old = coef # h'(eta) - hp = self.link.inverse_derivative(eta) + hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -799,13 +834,13 @@ def fit(self, X, y, weight=None): # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = self.link.inverse(eta) + mu = link.inverse(eta) # which tolerace? |coef - coef_old| or gradient? # use gradient for compliance with newton-cg and lbfgs # TODO: faster computation of gradient, use mu and eta directly - gradient = self.family._deviance_derivative( - coef=coef, X=Xnew, y=y, weight=weight, link=self.link) + gradient = family._deviance_derivative( + coef=coef, X=Xnew, y=y, weight=weight, link=link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -818,9 +853,9 @@ def fit(self, X, y, weight=None): # TODO: performance: make one function return both deviance and # gradient of deviance elif self.solver == 'lbfgs': - func = self.family._deviance - fprime = self.family._deviance_derivative - args = (Xnew, y, weight, self.link) + func = family._deviance + fprime = family._deviance_derivative + args = (Xnew, y, weight, link) coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, @@ -836,13 +871,13 @@ def fit(self, X, y, weight=None): info["task"])) self.n_iter_ = info['nit'] elif self.solver == 'newton-cg': - func = self.family._deviance - grad = self.family._deviance_derivative + func = family._deviance + grad = family._deviance_derivative def grad_hess(coef, X, y, weight, link): - grad = (self.family._deviance_derivative( + grad = (family._deviance_derivative( coef, X, y, weight, link)) - hessian = (self.family._deviance_hessian( + hessian = (family._deviance_hessian( coef, X, y, weight, link)) def Hs(s): @@ -850,7 +885,7 @@ def Hs(s): return ret return grad, Hs hess = grad_hess - args = (Xnew, y, weight, self.link) + args = (Xnew, y, weight, link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) self.coef_ = coef @@ -871,37 +906,50 @@ def predict(self, X, weight=1): If weights are given, returns prediction*weights. """ check_is_fitted(self, "coef_") + X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + # TODO: validation of weight eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) + mu = self._link_instance.inverse(eta) return mu*weight def estimate_phi(self, y, X, weight): + """Estimation of the dispersion parameter. + Returns the estimate. 
+ """ + check_is_fitted(self, "coef_") n_samples, n_features = X.shape eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) + mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(weight*(y-mu)**2/self.family.unit_variance(mu)) + chisq = np.sum(weight*(y-mu)**2 / + self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self.family.deviance(y, mu, weight) + dev = self._family_instance.deviance(y, mu, weight) return dev/(n_samples - n_features) +# TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" +# in check_estimator for score +# from sklearn.utils.estimator_checks import check_estimator +# from sklearn.linear_model import GeneralizedLinearRegressor +# check_estimator(GeneralizedLinearRegressor) def score(self, X, y, weight=1): """The natural score for a GLM is -deviance. - Returns the weight averaged negitive deviance (the better the score, + Returns the weight averaged negative deviance (the better the score, the better the fit). Maximum score is therefore 0. """ # RegressorMixin has R^2 score. # TODO: Make it more compatible with the score function in # sklearn.metrics.regression.py + check_is_fitted(self, "coef_") eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) - output_errors = self.family.unit_deviance(y, mu) + mu = self._link_instance.inverse(eta) + output_errors = self._family_instance.unit_deviance(y, mu) weight = weight * np.ones_like(y) - return np.average(output_errors, weights=weight) + return -np.average(output_errors, weights=weight) From 85c52ec9c6adb3b1f75650cfa7fe0b770393d24e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 13 Aug 2017 01:46:16 +0200 Subject: [PATCH 005/209] [WIP] Add Generalized Linear Models (#9405) * make raw docstrings where appropriate * make ExponentialDispersionModel (i.e. TweedieDistribution) pickable: ExponentialDispersionModel has new properties include_lower_bound, method in_y_range is not abstract anymore. * set self.intercept_=0 if fit_intercept=False, such that it is always defined. * set score to D2, a generalized R2 with deviance instead of squared error, as does glmnet. This also solves issues with check_regressors_train(GeneralizedLinearRegressor), which assumes R2 score. * change of names: weight to weights in ExponentialDispersionModel and to sample_weight in GeneralizedLinearRegressor * add class method linear_predictor --- sklearn/linear_model/glm.py | 330 ++++++++++++++++++++++-------------- 1 file changed, 199 insertions(+), 131 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 0ee1564049329..b80842f817f4d 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -5,13 +5,16 @@ # Author: Christian Lorentzen # License: BSD 3 clause +# TODO: Write more tests # TODO: Which name/symbol for coefficients and weights in docu? # sklearn.linear_models uses w for coefficients. 
-# So far, coefficients=beta and weight=w (as standard literature) -# TODO: Add l2-penalty +# So far, coefficients=beta and weights=w (as standard literature) +# TODO: Add l2-penalty (maybe more general w.P.w with P penalty matrix) # TODO: Add l1-penalty (elastic net) # TODO: Add cross validation # TODO: Write docu and examples +# TODO: Make it as much consistent to other estimators in linear_model as +# possible # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -118,7 +121,7 @@ def inverse_derivative2(self, lin_pred): class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): - """Base class for reproductive Exponential Dispersion Models (EDM). + r"""Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -136,6 +139,8 @@ class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): ---------- lower_bound upper_bound + include_lower_bound + include_upper_bound Methods ------- @@ -174,15 +179,39 @@ def upper_bound(self): """ raise NotImplementedError() - @abstractmethod + @abstractproperty + def include_lower_bound(self): + """If True, values of y may equal lower bound: y >= lower_bound. + """ + raise NotImplementedError() + + @abstractproperty + def include_upper_bound(self): + """If True, values of y may equal upper bound: y <= upper_bound. + """ + raise NotImplementedError() + def in_y_range(self, x): """Returns true if x is in the valid range of Y~EDM. """ - raise NotImplementedError() + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) @abstractmethod def unit_variance(self, mu): - """The unit variance :math:`v(mu)` determines the variance as + r"""The unit variance :math:`v(mu)` determines the variance as a function of the mean mu by :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as @@ -194,27 +223,27 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - """The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + r"""The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. """ raise NotImplementedError() - def variance(self, mu, phi=1, weight=1): - """The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is + def variance(self, mu, phi=1, weights=1): + r"""The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, with unit variance v(mu). """ - return phi/weight * self.unit_variance(mu) + return phi/weights * self.unit_variance(mu) - def variance_derivative(self, mu, phi=1, weight=1): - """The derivative of the variance w.r.t. mu, + def variance_derivative(self, mu, phi=1, weights=1): + r"""The derivative of the variance w.r.t. mu, :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/w_i*v'(\mu_i)`, with unit variance v(mu). """ - return phi/weight * self.unit_variance_derivative(mu) + return phi/weights * self.unit_variance_derivative(mu) @abstractmethod def unit_deviance(self, y, mu): - """The unit_deviance :math:`d(y,\mu)`. + r"""The unit_deviance :math:`d(y,\mu)`. 
In terms of the log-likelihood it is given by :math:`d(y,\mu) = -2\phi\cdot \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` @@ -222,7 +251,7 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - """The derivative w.r.t. mu of the unit_deviance + r"""The derivative w.r.t. mu of the unit_deviance :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. @@ -232,30 +261,30 @@ def unit_deviance_derivative(self, y, mu): """ return -2*(y-mu)/self.unit_variance(mu) - def deviance(self, y, mu, weight=1): - """The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) - with weight :math:`w_i` and unit_deviance :math:`d(y,mu)`. + def deviance(self, y, mu, weights=1): + r"""The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) + with weights :math:`w_i` and unit_deviance :math:`d(y,mu)`. In terms of the likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{w}) - loglike(y,y,\frac{phi}{w})\right).` """ - return np.sum(weight*self.unit_deviance(y, mu)) + return np.sum(weights*self.unit_deviance(y, mu)) - def _deviance(self, coef, X, y, weight, link): + def _deviance(self, coef, X, y, weights, link): """The deviance as a function of the coefficients ``coef`` (:math:`beta`). """ lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - return self.deviance(y, mu, weight) + return self.deviance(y, mu, weights) - def deviance_derivative(self, y, mu, weight=1): + def deviance_derivative(self, y, mu, weights=1): """The derivative w.r.t. mu of the deviance.` """ - return weight*self.unit_deviance_derivative(y, mu) + return weights*self.unit_deviance_derivative(y, mu) - def _score(self, coef, phi, X, y, weight, link): - """The score function :math:`s` is the derivative of the + def _score(self, coef, phi, X, y, weights, link): + r"""The score function :math:`s` is the derivative of the log-likelihood w.r.t. the ``coef`` (:math:`\beta`). It is given by @@ -270,7 +299,7 @@ def _score(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), shape=(n_samples, n_samples)) @@ -278,8 +307,8 @@ def _score(self, coef, phi, X, y, weight, link): score = safe_sparse_dot(X.T, temp, dense_output=False) return score - def _fisher_matrix(self, coef, phi, X, y, weight, link): - """The Fisher information matrix, also known as expected + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""The Fisher information matrix, also known as expected information matrix. It is given by .. 
math: @@ -295,7 +324,7 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), shape=(n_samples, n_samples)) @@ -303,8 +332,8 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) return fisher_matrix - def _observed_information(self, coef, phi, X, y, weight, link): - """The observed information matrix, also known as the negative of + def _observed_information(self, coef, phi, X, y, weights, link): + r"""The observed information matrix, also known as the negative of the Hessian matrix of the log-likelihood. It is given by .. math: @@ -327,7 +356,7 @@ def _observed_information(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) d2 = link.inverse_derivative(lin_pred)**2 v = self.unit_variance_derivative(mu)/self.unit_variance(mu) @@ -338,33 +367,34 @@ def _observed_information(self, coef, phi, X, y, weight, link): observed_information = safe_sparse_dot(X.T, temp, dense_output=False) return observed_information - def _deviance_derivative(self, coef, X, y, weight, link): - """The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a + def _deviance_derivative(self, coef, X, y, weights, link): + r"""The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a function of the coefficients ``coef``. This is equivalent to :math:`-2\phi` times the score function :math:`s` (derivative of the log-likelihood). """ - score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, + score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) return -2*score - def _deviance_hessian(self, coef, X, y, weight, link): - """The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance + def _deviance_hessian(self, coef, X, y, weights, link): + r"""The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance as a function of the coefficients ``coef``. This is equivalent to :math:`+2\phi` times the observed information matrix. """ info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, - weight=weight, link=link) + weights=weights, link=link) return 2*info_matrix - def starting_mu(self, y, weight=1): + def starting_mu(self, y, weights=1): """Starting values for the mean mu_i in IRLS.""" - return (weight*y+np.mean(weight*y))/(2.*np.sum(np.ones_like(y)*weight)) + return ((weights*y+np.mean(weights*y)) + / (2.*np.sum(np.ones_like(y)*weights))) class TweedieDistribution(ExponentialDispersionModel): - """A class for the Tweedie distribution. + r"""A class for the Tweedie distribution. They have mu=E[X] and Var[X] \propto mu**power. Attributes @@ -373,67 +403,44 @@ class TweedieDistribution(ExponentialDispersionModel): The variance power of the unit_variance :math:`v(mu) = mu^{power}`. 
""" - def _less_upper_bound(self, x): - return np.less(x, self.upper_bound) - - def _less_equal_upper_bound(self, x): - return np.less_equal(x, self.upper_bound) - - def _greater_lower_bound(self, x): - return np.greater(x, self.lower_bound) - - def _greater_equal_lower_bound(self, x): - return np.greater_equal(x, self.lower_bound) - def __init__(self, power=0): self.power = power self._upper_bound = np.Inf - # self._upper_compare = lambda x: np.less(x, self.upper_bound) - self._upper_compare = self._less_upper_bound + self._include_upper_bound = False if power < 0: # Extreme Stable self._lower_bound = -np.Inf - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power == 0: # NormalDistribution self._lower_bound = -np.Inf - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): # Compound Poisson self._lower_bound = 0 - # self._lower_compare = ( - # lambda x: np.greater_equal(x, self.lower_bound)) - self._lower_compare = self._greater_equal_lower_bound + self._include_lower_bound = True elif power == 2: # GammaDistribution self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif (power > 2) and (power < 3): # Positive Stable self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power == 3: # InverseGaussianDistribution self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power > 3: # Positive Stable self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False @property def power(self): @@ -454,8 +461,13 @@ def lower_bound(self): def upper_bound(self): return self._upper_bound - def in_y_range(self, x): - return np.logical_and(self._lower_compare(x), self._upper_compare(x)) + @property + def include_lower_bound(self): + return self._include_lower_bound + + @property + def include_upper_bound(self): + return self._include_upper_bound def unit_variance(self, mu): """The unit variance of a Tweedie distribution is v(mu)=mu**power. 
@@ -485,7 +497,7 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, beta, phi, weight=1): + def likelihood(self, y, X, beta, phi, weights=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -521,6 +533,8 @@ class GeneralizedHyperbolicSecand(ExponentialDispersionModel): def __init__(self): self._lower_bound = -np.Inf self._upper_bound = np.Inf + self._include_lower_bound = False + self._include_upper_bound = False @property def lower_bound(self): @@ -530,11 +544,13 @@ def lower_bound(self): def upper_bound(self): return self._upper_bound - def in_y_range(self, x): - np.logical_and( - np.greater(x, self.lower_bound), - np.less(x, self.lower_bound) - ) + @property + def include_lower_bound(self): + return self._include_lower_bound + + @property + def include_upper_bound(self): + return self._include_upper_bound def unit_variance(self, mu): return 1 + mu**2 @@ -548,7 +564,7 @@ def unit_deviance(self, y, mu): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """ + r""" Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). @@ -665,7 +681,7 @@ def __init__(self, fit_intercept=True, family='normal', self.start_params = start_params self.verbose = verbose - def fit(self, X, y, weight=None): + def fit(self, X, y, sample_weight=None): """Fit a generalized linear model. Parameters @@ -676,11 +692,12 @@ def fit(self, X, y, weight=None): y : numpy array of shape [n_samples] Target values - weight : numpy array of shape [n_samples] + sample_weight : numpy array of shape [n_samples] Individual weights for each sample. Var[Y_i]=phi/weight_i * v(mu) If Y_i ~ EDM(mu, phi/w_i) then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)) + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. 
Returns ------- @@ -751,16 +768,17 @@ def fit(self, X, y, weight=None): "range for family {0}" .format(family.__class__.__name__)) - if weight is None: - weight = np.ones_like(y) - elif np.isscalar(weight): - weight = weight*np.ones_like(y) + if sample_weight is None: + weights = np.ones_like(y) + elif np.isscalar(sample_weight): + weights = sample_weight*np.ones_like(y) else: - weight = np.atleast_1d(weight) - if weight.ndim > 1: - raise ValueError("Weights must be 1D array or scalar") - elif weight.shape[0] != y.shape[0]: - raise ValueError("Weights must have the same length as y") + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != y.shape[0]: + raise ValueError("Sample weights must have the same length as" + " y") if self.fit_intercept: # intercept is first column <=> coef[0] is for intecept @@ -780,14 +798,14 @@ def fit(self, X, y, weight=None): coef = None if start_params is None: # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weight) + mu = family.starting_mu(y, weights) # linear predictor eta = link.link(mu) # h'(eta) hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weights=weights)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -819,7 +837,7 @@ def fit(self, X, y, weight=None): hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weights=weights)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -840,7 +858,7 @@ def fit(self, X, y, weight=None): # use gradient for compliance with newton-cg and lbfgs # TODO: faster computation of gradient, use mu and eta directly gradient = family._deviance_derivative( - coef=coef, X=Xnew, y=y, weight=weight, link=link) + coef=coef, X=Xnew, y=y, weights=weights, link=link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -855,7 +873,7 @@ def fit(self, X, y, weight=None): elif self.solver == 'lbfgs': func = family._deviance fprime = family._deviance_derivative - args = (Xnew, y, weight, link) + args = (Xnew, y, weights, link) coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, @@ -874,47 +892,72 @@ def fit(self, X, y, weight=None): func = family._deviance grad = family._deviance_derivative - def grad_hess(coef, X, y, weight, link): + def grad_hess(coef, X, y, weights, link): grad = (family._deviance_derivative( - coef, X, y, weight, link)) + coef, X, y, weights, link)) hessian = (family._deviance_hessian( - coef, X, y, weight, link)) + coef, X, y, weights, link)) def Hs(s): ret = np.dot(hessian, s) return ret return grad, Hs hess = grad_hess - args = (Xnew, y, weight, link) + args = (Xnew, y, weights, link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) self.coef_ = coef - if self.fit_intercept is True: + if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] else: + self.intercept_ = 0. 
self.coef_ = coef if self.fit_dispersion in ['chisqr', 'deviance']: - self.dispersion_ = self.estimate_phi(y, X, weight) + self.dispersion_ = self.estimate_phi(y, X, weights) return self - def predict(self, X, weight=1): - """Prediction with features X. - If weights are given, returns prediction*weights. + def linear_predictor(self, X): + """The linear_predictor X*coef_ + intercept_. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Samples. + + Returns + ------- + C : array, shape = (n_samples) + Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) - # TODO: validation of weight - eta = safe_sparse_dot(X, self.coef_, dense_output=True) - if self.fit_intercept is True: - eta += self.intercept_ + return safe_sparse_dot(X, self.coef_, + dense_output=True) + self.intercept_ + + def predict(self, X, sample_weight=1): + """Predict uing GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Samples. + + Returns + ------- + C : array, shape = (n_samples) + Returns predicted values times sample_weight. + """ + # TODO: validation of sample_weight + eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) - return mu*weight + return mu*sample_weight - def estimate_phi(self, y, X, weight): + def estimate_phi(self, y, X, sample_weight): """Estimation of the dispersion parameter. Returns the estimate. """ @@ -925,11 +968,11 @@ def estimate_phi(self, y, X, weight): eta += self.intercept_ mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(weight*(y-mu)**2 / + chisq = np.sum(sample_weight*(y-mu)**2 / self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, weight) + dev = self._family_instance.deviance(y, mu, sample_weight) return dev/(n_samples - n_features) # TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" @@ -937,19 +980,44 @@ def estimate_phi(self, y, X, weight): # from sklearn.utils.estimator_checks import check_estimator # from sklearn.linear_model import GeneralizedLinearRegressor # check_estimator(GeneralizedLinearRegressor) - def score(self, X, y, weight=1): - """The natural score for a GLM is -deviance. - Returns the weight averaged negative deviance (the better the score, - the better the fit). Maximum score is therefore 0. + def score(self, X, y, sample_weight=None): + r"""Returns D^2, a generalization of the coefficient of determination + R^2, which uses deviance instead of squared error. + + D^2 is defined as + :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` + is the null deviance, i.e. the deviance of a model with intercept + alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean + :math:`\bar{y}` is average by sample_weight. In the case of a Normal + distribution, this D^2 equals R^2. + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test samples + + y : array-like of shape = (n_samples) + True valeus for X. + + sample_weight : array-like, shape = (n_samples), optional + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) wrt. y. """ - # RegressorMixin has R^2 score. 
- # TODO: Make it more compatible with the score function in - # sklearn.metrics.regression.py - check_is_fitted(self, "coef_") - eta = safe_sparse_dot(X, self.coef_, dense_output=True) - if self.fit_intercept is True: - eta += self.intercept_ - mu = self._link_instance.inverse(eta) - output_errors = self._family_instance.unit_deviance(y, mu) - weight = weight * np.ones_like(y) - return -np.average(output_errors, weights=weight) + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + if sample_weight is None: + weights = np.ones_like(y) + else: + weights = np.atleast_1d(sample_weight) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. - dev / dev_null From 0f4bdb3a8c5c45e80786b8156398da93bfc597e8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 18 Sep 2017 23:41:19 +0200 Subject: [PATCH 006/209] [WIP] Add Generalized Linear Models (#9405) * added L2 penalty * api change: alpha, l1_ratio, P1, P2, warm_start, check_input, copy_X * added entry in user guide * improved docstrings * helper function _irls_step --- doc/modules/linear_model.rst | 113 +++- sklearn/linear_model/glm.py | 702 ++++++++++++++++++------- sklearn/linear_model/tests/test_glm.py | 115 +++- 3 files changed, 721 insertions(+), 209 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 75b95f6c7a44f..51b3821fa6207 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -786,7 +786,7 @@ non-smooth `penalty="l1"`. This is therefore the solver of choice for sparse multinomial logistic regression. It is also the only solver that supports `penalty="elasticnet"`. -The "lbfgs" is an optimization algorithm that approximates the +The "lbfgs" is an optimization algorithm that approximates the Broyden–Fletcher–Goldfarb–Shanno algorithm [8]_, which belongs to quasi-Newton methods. The "lbfgs" solver is recommended for use for small data-sets but for larger datasets its performance suffers. [9]_ @@ -874,6 +874,117 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized linear regression +============================= + +:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two +ways [1]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). + +Secondly, the squared loss function is replaced by the deviance :math:`D` of an +exponential dispersion model (EDM) [2]_. The objective function beeing minimized +becomes + +.. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 + +\frac{\alpha(1-\rho)}{2} w^T P_2 w + +with sample weights :math:`s`. +:math:`P_1` can be used to exclude some of the coefficients in the L1 +penalty, :math:`P_2` (must be positive semi-definite) allows for a more +versatile L2 penalty. + +Use cases, where a loss different from the squared loss might be appropriate, +are the following: + + * If the target values :math:`y` are counts (integer valued) or frequencies, you might try a Poisson deviance. 
+ + * If the target values are positive valued and skewed, you might try a Gamma deviance. + + * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). + +Since the linear predictor :math:`Xw` can be negative and +Poisson, Gamma and Inverse Gaussian distributions don't have negative values, +it is convenient to apply a link function different from the identity link +:math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=\exp(Xw)`. + +Note that the feature matrix `X` should be standardized before fitting. This +ensures that the penalty treats features equally. + + >>> from sklearn import linear_model + >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0) + >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + >>> reg.coef_ + array([ 0.24630255, 0.43373521]) + >>> reg.intercept_ + -0.76383575123143277 + +Mathematical formulation +------------------------ + +In the unpenalized case, the assumptions are the folowing: + + * The target values :math:`y_i` are realizations of random variables + :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` + with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter + :math:`\phi` and sample weights :math:`s_i`. + * The aim is to predict the expectation :math:`\mu_i` with + :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + +Note that the first assumption implies +:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance +function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the +same as specifying a unit variance function (they are one-to-one). + +Including penalties helps to avoid overfitting or, in case of L1 penalty, to +obtain sparse solutions. But there are also other motivations to include them, +e.g. accounting fo dependence structure of :math:`y`. + +The objective function, which is independent of :math:`\phi`, is minimized with +respect to the coefficients :math:`w`. + +The deviance is defined by + +.. math:: D(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\frac{\phi}{s}) + - loglike(y,y,\frac{\phi}{s})\right) + +===================================== ================================= +Distribution Variance Function :math:`v(\mu)` +===================================== ================================= +Normal ("normal") :math:`1` +Poisson ("poisson") :math:`\mu` +Gamma ("gamma") :math:`\mu^2` +Inverse Gaussian ("inverse.gaussian") :math:`\mu^3` +===================================== ================================= + +Two remarks: + +* The deviances for at least Normal, Poisson and Gamma distributions are + strictly consistent scoring functions for the mean :math:`\mu`, see Eq. + (19)-(20) in [3]_. + +* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) + you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. + + +.. topic:: References: + + .. [1] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. [2] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. 
+ See also `Exponential dispersion model. `_ + + .. [3] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b80842f817f4d..2db3c56d5e1c1 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,15 +6,15 @@ # License: BSD 3 clause # TODO: Write more tests -# TODO: Which name/symbol for coefficients and weights in docu? -# sklearn.linear_models uses w for coefficients. -# So far, coefficients=beta and weights=w (as standard literature) -# TODO: Add l2-penalty (maybe more general w.P.w with P penalty matrix) # TODO: Add l1-penalty (elastic net) +# TODO: deal with option self.copy_X +# TODO: Should the option `normalize` be included (like other linear models)? +# So far, it is not included. User must pass a normalized X. # TODO: Add cross validation -# TODO: Write docu and examples +# TODO: Write examples and more docu # TODO: Make it as much consistent to other estimators in linear_model as # possible +# TODO: options P1 and P2 in fit() or in __init__()??? # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -22,10 +22,21 @@ # Linear Model does both depending on the chosen distribution, e.g. Normal => # regressor, Bernoulli/Binomial => classifier. # Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w a componentwise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. # Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. + from __future__ import division from abc import ABCMeta, abstractmethod, abstractproperty @@ -34,6 +45,8 @@ from scipy import linalg, optimize, sparse import warnings from .base import LinearRegression +from .coordinate_descent import ElasticNet +from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six @@ -164,7 +177,8 @@ class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): References ---------- - See https://en.wikipedia.org/wiki/Exponential_dispersion_model. + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. """ @abstractproperty @@ -192,7 +206,7 @@ def include_upper_bound(self): raise NotImplementedError() def in_y_range(self, x): - """Returns true if x is in the valid range of Y~EDM. + """Returns true if `x` is in the valid range of Y~EDM. """ if self.include_lower_bound: if self.include_upper_bound: @@ -211,33 +225,36 @@ def in_y_range(self, x): @abstractmethod def unit_variance(self, mu): - r"""The unit variance :math:`v(mu)` determines the variance as - a function of the mean mu by - :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. 
+ r"""The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. """ raise NotImplementedError() @abstractmethod def unit_variance_derivative(self, mu): - r"""The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. """ raise NotImplementedError() def variance(self, mu, phi=1, weights=1): - r"""The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is - :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, - with unit variance v(mu). + r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. """ return phi/weights * self.unit_variance(mu) def variance_derivative(self, mu, phi=1, weights=1): - r"""The derivative of the variance w.r.t. mu, + r"""The derivative of the variance w.r.t. `mu`, :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/w_i*v'(\mu_i)`, with unit variance v(mu). + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. """ return phi/weights * self.unit_variance_derivative(mu) @@ -251,8 +268,8 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - r"""The derivative w.r.t. mu of the unit_deviance - :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + r"""The derivative w.r.t. `mu` of the unit deviance + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. Returns @@ -262,39 +279,39 @@ def unit_deviance_derivative(self, y, mu): return -2*(y-mu)/self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) - with weights :math:`w_i` and unit_deviance :math:`d(y,mu)`. + r"""The deviance is given by :math:`D = \sum_i s_i \cdot d(y, \mu) + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. In terms of the likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{w}) - - loglike(y,y,\frac{phi}{w})\right).` + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. """ return np.sum(weights*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weights, link): - """The deviance as a function of the coefficients ``coef`` - (:math:`beta`). + """The deviance as a function of the coefficients `coef` + (:math:`w`). """ lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) return self.deviance(y, mu, weights) def deviance_derivative(self, y, mu, weights=1): - """The derivative w.r.t. mu of the deviance.` + """The derivative w.r.t. `mu` of the deviance. """ return weights*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weights, link): - r"""The score function :math:`s` is the derivative of the - log-likelihood w.r.t. the ``coef`` (:math:`\beta`). + r"""The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). It is given by .. 
math: - \mathbf{s}(\boldsymbol{\beta}) = \mathbf{X}^T \mathbf{D} + \mathbf{score}(\boldsymbol{w}) = \mathbf{X}^T \mathbf{D} \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and - :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}(y_1),\ldots)`. + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -303,23 +320,27 @@ def _score(self, coef, phi, X, y, weights, link): d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) - score = safe_sparse_dot(X.T, temp, dense_output=False) + temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=True) + score = safe_sparse_dot(X.T, temp, dense_output=True) return score def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""The Fisher information matrix, also known as expected - information matrix. It is given by + r"""The Fisher information matrix. + The Fisher information matrix, also known as expected information + matrix is given by .. math: - \mathbf{F}(\boldsymbol{\beta}) = \mathrm{E}\left[ - -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} - \partial\boldsymbol{\beta}^T}\right] + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] = \mathbf{X}^T W \mathbf{X} \,, with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see score function. + see func:`score_function`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -333,14 +354,15 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): - r"""The observed information matrix, also known as the negative of + r"""The observed information matrix. + The observed information matrix, also known as the negative of the Hessian matrix of the log-likelihood. It is given by .. math: - \mathbf{H}(\boldsymbol{\beta}) = - -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} - \partial\boldsymbol{\beta}^T} + \mathbf{H}(\boldsymbol{w}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T} = \mathbf{X}^T \legt[ - \mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R} @@ -351,7 +373,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ v(\mu_i)} \right)`, - see score function and Fisher matrix. + see :func:`score_` function and :func:`_fisher_matrix`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -368,18 +390,18 @@ def _observed_information(self, coef, phi, X, y, weights, link): return observed_information def _deviance_derivative(self, coef, X, y, weights, link): - r"""The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a - function of the coefficients ``coef``. + r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a + function of the coefficients `coef`. This is equivalent to :math:`-2\phi` times the score function - :math:`s` (derivative of the log-likelihood). + :func:`score_function` (derivative of the log-likelihood). 
""" score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) return -2*score def _deviance_hessian(self, coef, X, y, weights, link): - r"""The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance - as a function of the coefficients ``coef``. + r"""The hessian matrix w.r.t. `coef` (:math:`w`) of the deviance + as a function of the coefficients `coef`. This is equivalent to :math:`+2\phi` times the observed information matrix. """ @@ -388,20 +410,21 @@ def _deviance_hessian(self, coef, X, y, weights, link): return 2*info_matrix def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in IRLS.""" - return ((weights*y+np.mean(weights*y)) - / (2.*np.sum(np.ones_like(y)*weights))) + """Starting values for the mean mu_i in (unpenalized) IRLS.""" + return ((weights*y+np.mean(weights*y)) / + (2.*np.sum(np.ones_like(y)*weights))) class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - They have mu=E[X] and Var[X] \propto mu**power. + They have :math:`\mu=\mathrm{E}[Y]` and + :math:`\mathrm{Var}[Y] \propto \mu^power. Attributes ---------- power : float The variance power of the unit_variance - :math:`v(mu) = mu^{power}`. + :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): self.power = power @@ -497,7 +520,7 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, beta, phi, weights=1): + def likelihood(self, y, X, w, phi, weights=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -563,51 +586,135 @@ def unit_deviance(self, y, mu): np.log((1+mu**2)/(1+y**2))) +def _irls_step(X, W, P2, z): + """One step in iteratively reweighted least squares + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. + + Parameters + ---------- + X : numpy array or sparse matrix of shape (n_samples, n_features) + Training data (with intercept included if present) + + W : numpy array of shape (n_samples, ) + + P2 : numpy array or sparse matrix of shape (n_features, n_features) + The l2-penalty matrix or vector (=diagonal matrix) + + z : numpy array of shape (n_samples, ) + Working observations + + Returns + ------- + coef: array, shape = (X.shape[1]) + """ + # TODO: scipy.linalg.solve if faster, but ordinary least squares uses + # scipy.linalg.lstsq. What is more appropriate? + n_samples, n_features = X.shape + if sparse.issparse(X): + W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() + if P2.ndim == 1: + L2 = (sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) + ).tocsr() + else: + L2 = sparse.csr_matrix(P2) + XtW = X.transpose() * W + A = XtW * X + L2 + b = XtW * z + coef = sparse.linalg.spsolve(A, b) + else: + XtW = (X.T * W) + A = XtW.dot(X) + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + else: + A += P2 + b = XtW.dot(z) + coef = linalg.solve(A, b) + return coef + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - r""" - Class to fit a Generalized Linear Model (GLM) based on reproductive - Exponential Dispersion Models (EDM). + """Regression via a Generalized Linear Model (GLM) based on reproductive + Exponential Dispersion Models (EDM) with combined L1 and L2 priors as + regularizer. 
+ + Minimizes the objective function:: + + 1/(2s) * deviance(y, h(X*w)) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function `h` and s=sum of `sample_weight` (which equals + n_samples for `sample_weight=None`). + For `P1`=`P2`=identity, the penalty is the elastic net:: - #TODO: This belongs to User Guide - Assumptions: + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - - The target values y_i are realizations of random variables - :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion - parameter :math:`\phi` and weights :math:`w_i`. - - The expectation of :math:`Y_i` is :math:`\mu_i=\mathrm{E}[Y]=h(\eta_i)` - whith the linear predictor :math:`\eta=X*\beta`, inverse link function - :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` - to be estimated. + If you are interested in controlling the L1 and L2 penalty + separately, keep in mind that this is equivalent to:: - Note that the first assumption implies - :math:`\mathrm{Var}[Y_i]=\frac{\phi}{w_i} v(\mu_i)` with uni variance - function :math:`v(\mu)`. + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter `l1_ratio` corresponds to alpha in the glmnet R package while + alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio + = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\mu_i=h(\eta_i)` and - :math:`Var[Y_i]=\frac{\phi}{w_i} v(\mu_i)` + the first two moments :math:`E[Y_i]=\\mu_i=h(\\eta_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{w_i} v(\\mu_i)`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + (penalized) maximum likelihood which is equivalent to minimizing the + deviance. - The parameters :math:`\beta` are estimated by maximum likelihood which is - equivalent to minimizing the deviance. + TODO: For `alpha` > 0, the feature matrix `X` is assumed to be + standardized. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. TODO: Estimation of the dispersion parameter phi. - TODO: Notes on weights and 'scaled' Poisson, e.g. fit y = x/w with - with x=counts and w=exposure (time, money, persons, ...) => y is a - ratio with weights w. + TODO: Notes on weights and 'scaled' distributions. For Poisson, this means + to fit y = z/w with z=counts and w=exposure (time, money, persons, ...) + => y is a ratio with weights w. Same for other distributions. Parameters ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms und thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (defaul=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). 
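To make the correspondence above concrete, separately chosen L1 and L2 strengths ``a`` and
``b`` translate into this parametrization as follows (illustrative numbers only)::

    a, b = 0.3, 0.7            # desired L1 and L2 strengths
    alpha = a + b              # -> 1.0
    l1_ratio = a / (a + b)     # -> 0.3
    # then alpha * l1_ratio == a and alpha * (1 - l1_ratio) == b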
- family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance - of a subclass of ExponentialDispersionModel, optional - (default='normal') + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ + of class ExponentialDispersionModel, optional(default='normal') the distributional assumption of the GLM. - link : {'identity', 'log'} or an instance of a subclass of Link, + link : {'identity', 'log'} or an instance of class Link, optional (default='identity') the link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). @@ -634,28 +741,41 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): where ``g_i`` is the i-th component of the gradient (derivative of the deviance). - start_params : {array shape (n_features, ), 'ols'}, optional (default=None) - sets the start values for coef_ in the fit. - If None, default values are taken. - If 'ols' the result of an ordinary least squares in the link space - (linear predictor) is taken. - If an array is given, these values are taken as coef_ to start with. - If fit_intercept is true, the first value is assumed to be the start - value for the intercept_. + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to fit as + initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to fit), option ``start_params`` sets the + starting values for ``coef_`` and ``intercept_``. + + start_params : None or array of shape (n_features, ) or 'least_squares'}, \ + optional (default=None) + If an array of size n_features is supplied, use these as start values + for ``coef_`` in the fit. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + If 'least_squares' is set, the result of a least squares fit in the + link space (linear predictor) is taken. If ``None``, the start values + are calculated by setting mu to family.starting_mu(..) and one step of + irls. + This option only applies if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not exist). + + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. verbose : int, optional (default=0) For the lbfgs solver set verbose to any positive number for verbosity. Attributes ---------- - coef_ : array, shape (1, n_features) + coef_ : array, shape (n_features, ) Estimated coefficients for the linear predictor (X*coef_) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. dispersion_ : float - The dispersion parameter :math:`\phi` if fit_dispersion is set. + The dispersion parameter :math:`\\phi` if fit_dispersion is set. n_iter_ : int Actual number of iterations of the solver. 
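A short usage sketch of the interplay of ``warm_start`` and ``start_params`` described
above (hypothetical data; behaviour as documented in this patch)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[1.], [2.], [3.], [4.]])
    y = np.array([1., 2., 3., 5.])

    reg = GeneralizedLinearRegressor(family='poisson', link='log',
                                     alpha=1.0, warm_start=True)
    reg.fit(X, y)                 # first fit: start values come from start_params
    reg.set_params(alpha=0.1)
    reg.fit(X, y)                 # refit: starts from the previous coef_/intercept_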
@@ -667,10 +787,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- TODO """ - - def __init__(self, fit_intercept=True, family='normal', - link='identity', fit_dispersion='chisqr', solver='irls', - max_iter=100, tol=1e-4, start_params=None, verbose=0): + def __init__(self, alpha=1.0, l1_ratio=0, + fit_intercept=True, family='normal', link='identity', + fit_dispersion='chisqr', solver='irls', max_iter=100, + tol=1e-4, warm_start=False, start_params=None, copy_X=True, + verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio self.fit_intercept = fit_intercept self.family = family self.link = link @@ -678,31 +801,86 @@ def __init__(self, fit_intercept=True, family='normal', self.solver = solver self.max_iter = max_iter self.tol = tol + self.warm_start = warm_start self.start_params = start_params + self.copy_X = copy_X self.verbose = verbose - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, P1=None, P2=None, + check_input=True): """Fit a generalized linear model. Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix of shape (n_samples, n_features) Training data - y : numpy array of shape [n_samples] + y : numpy array of shape (n_samples, ) Target values - sample_weight : numpy array of shape [n_samples] + sample_weight : array of shape (n_samples, ) or None,\ + optinal (default=None) Individual weights for each sample. Var[Y_i]=phi/weight_i * v(mu) If Y_i ~ EDM(mu, phi/w_i) then sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. + P1 : None or array of shape (n_features*, ), optional\ + (default=None) + With this array, you can exclude coefficients from ths L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``None`` is the same as an array of ones. + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + P2 : None or array of shape (n_features*, n_features*) + With this square matrix the L2 penalty is calculated as `w P2 w`. + This gives a fine control over this penalty (Tikhonov + regularization). + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + check_input : boolean, optional (default=True) + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + Returns ------- self : returns an instance of self. """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) + y = y.astype(np.float64) + + if sample_weight is None: + weights = np.ones_like(y) + elif np.isscalar(sample_weight): + weights = sample_weight*np.ones_like(y) + else: + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != y.shape[0]: + raise ValueError("Sample weights must have the same length as" + " y") + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights = weights/np.sum(weights) + + if not isinstance(check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(check_input)) + + # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): @@ -720,7 +898,8 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") + " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" + " got (family={0})".format(self.family)) # Garantee that self._link_instance is set to an instance of class Link if isinstance(self.link, Link): @@ -733,132 +912,232 @@ def fit(self, X, y, sample_weight=None): else: raise ValueError( "The link must be an instance of class Link or" - " an element of ['identity', 'log'].") - + " an element of ['identity', 'log']; got (link={0})" + .format(self.link)) + + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be non-negative;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be in interval [0, 1]; got" + " (l1_ratio={0]})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): - raise ValueError("The argument fit_intercept must be bool," + raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) if self.solver not in ['irls', 'lbfgs', 'newton-cg']: raise ValueError("GLM Regression supports only irls, lbfgs and" "newton-cg solvers, got {0}".format(self.solver)) + if self.alpha > 0: + if (self.l1_ratio > 0 and + self.solver not in []): + # TODO: Add solver for L1 + # raise ValueError("The solver option (solver={0}) is not " + # "appropriate for the chosen penalty which" + # " includes L1 (alpha={1})." 
+ # .format(self.solver, self.alpha)) + raise NotImplementedError("Currently, no solver is implemented" + " that can deal with L1 penalties.") if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iteration must be positive;" " got (max_iter={0!r})".format(self.max_iter)) if not isinstance(self.tol, numbers.Number) or self.tol < 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params is not 'ols': + if start_params is not None and start_params is not 'least_squares': start_params = np.atleast_1d(start_params) - if start_params.shape[0] != X.shape[1] + self.fit_intercept: + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" - "right length; required length {0}, got {1}" + "right length and dimension; required (length" + "={0}, ndim=1), got (length={1}, ndim={2})." .format(X.shape[1] + self.fit_intercept, - start_params.shape[0])) - - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=False) - y = y.astype(np.float64) + start_params.shape[0], + start_params.ndim)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + + if P1 is None: + P1 = np.ones(X.shape[1]) + else: + P1 = np.atleast_1d(P1) + if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): + raise ValueError("P1 must be either None or an 1D array with " + "the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." 
+ .format(P1.shape[0], X.shape[1])) + if P2 is None: + P2 = np.ones(X.shape[1]) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + else: + P2 = check_array(P2, accept_sparse=['csr', 'csc', 'coo'], + dtype="numeric", ensure_2d=True) + if ((P2.shape[0] != P2.shape[1]) or + (P2.shape[0] != X.shape[1]) or + (P2.ndim != 2)): + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},{1})), needed ({3},{3})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) family = self._family_instance link = self._link_instance - if not np.all(family.in_y_range(y)): - raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}" - .format(family.__class__.__name__)) - - if sample_weight is None: - weights = np.ones_like(y) - elif np.isscalar(sample_weight): - weights = sample_weight*np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != y.shape[0]: - raise ValueError("Sample weights must have the same length as" - " y") - if self.fit_intercept: # intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) + P1 = np.concatenate((np.array([0]), P1)) + if P2.ndim == 1: + P2 = np.concatenate((np.array([0]), P2)) + elif sparse.issparse(P2): + P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), + dtype=P2.dtype).tocsr() + else: + P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], + [np.zeros((X.shape[1], 1)), P2]]) else: Xnew = X n_samples, n_features = Xnew.shape - - # Note: Since dispersion_ alias phi does not enter the estimation - # of mu_i=E[y_i] set it to 1 where convenient. + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1-self.l1_ratio) + P1 *= l1 + P2 *= l2 + + # 1.3 additional validations ########################################## + if check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # TODO: if alpha=0 check that Xnew is not rank deficient + # TODO: what else to check? + + ####################################################################### + # 2. initialization of coef = (intercept_, coef_) # + ####################################################################### + # Note: Since phi=self.dispersion_ does not enter the estimation + # of mu_i=E[y_i], set it to 1. 
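A small sketch of the penalty padding performed just above when ``fit_intercept=True``:
the intercept becomes coefficient 0 and must never be penalized, so a leading zero
row/column is prepended (values purely illustrative, not from the patch)::

    import numpy as np

    n_features = 3
    P1 = np.array([1., 1., 0.])              # exclude the third feature from L1
    P1 = np.concatenate(([0.], P1))          # -> [0., 1., 1., 0.], intercept unpenalized

    P2 = np.eye(n_features)                  # dense Tikhonov matrix for the features
    P2 = np.block([[np.zeros((1, 1)),          np.zeros((1, n_features))],
                   [np.zeros((n_features, 1)), P2]])   # pad a zero row and column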
# set start values for coef coef = None - if start_params is None: - # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weights) - # linear predictor - eta = link.link(mu) - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights w, in principle a diagonal matrix - # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weights=weights)) - wroot = np.sqrt(w) - # working observations - yw = eta + (y-mu)/hp - # least squares rescaled with wroot - wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) - X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) - yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) - coef = linalg.lstsq(X_rescale, yw_rescale)[0] - elif start_params is 'ols': - reg = LinearRegression(copy_X=False, fit_intercept=False) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ + if self.warm_start and hasattr(self, "coef_"): + if self.fit_intercept: + coef = np.concatenate((self.intercept_, self.coef_)) + else: + coef = self.coef_ + elif self.start_params is None: + if self.l1_ratio == 0: + # See 3.1 IRLS + # Use mu_start and apply one irls step to calculate coef + mu = family.starting_mu(y, weights) + # linear predictor + eta = link.link(mu) + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(Xnew, W, P2, z) + else: + # with L1 penalty, start with coef = 0 + coef = np.zeros(n_features) + elif self.start_params is 'least_squares': + if self.alpha == 0: + reg = LinearRegression(copy_X=True, fit_intercept=False) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + elif self.l1_ratio <= 0.01: + # ElasticNet says l1_ratio <= 0.01 is not reliable, use Ridge + reg = Ridge(copy_X=True, fit_intercept=False, + alpha=self.alpha) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + else: + # TODO: Does this make sense? + reg = ElasticNet(copy_X=True, fit_intercept=False, + alpha=self.alpha, l1_ratio=self.l1_ratio) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ else: coef = start_params + ####################################################################### + # 3. fit # + ####################################################################### # algorithms for optimiation # TODO: Parallelize it self.n_iter_ = 0 converged = False + # 3.1 IRLS ############################################################ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. 
E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) + # Note: P2 = l2*P2, see above if self.solver == 'irls': - # linear predictor + # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 # coef_old not used so far. # coef_old = coef - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights w, in principle a diagonal matrix + # working weights W, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weights=weights)) - wroot = np.sqrt(w) + W = (hp**2 / V) # working observations - yw = eta + (y-mu)/hp - # least squares rescaled with wroot - wroot = sparse.dia_matrix((wroot, 0), - shape=(n_samples, n_samples)) - X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) - yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) - coef, residues, rank, singular_ = ( - linalg.lstsq(X_rescale, yw_rescale)) + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(Xnew, W, P2, z) # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) # which tolerace? |coef - coef_old| or gradient? # use gradient for compliance with newton-cg and lbfgs - # TODO: faster computation of gradient, use mu and eta directly - gradient = family._deviance_derivative( - coef=coef, X=Xnew, y=y, weights=weights, link=link) + # gradient = family._deviance_derivative( + # coef=coef, X=Xnew, y=y, weights=weights, link=link) + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + gradient = -safe_sparse_dot(Xnew.T, hp*(y-mu)/V) + if P2.ndim == 1: + gradient += P2*coef + else: + gradient += safe_sparse_dot(P2, coef) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -868,50 +1147,73 @@ def fit(self, X, y, sample_weight=None): "of iterations (currently {0})" .format(self.max_iter), ConvergenceWarning) + # 3.2 L-BFGS and Newton-CG ############################################ # TODO: performance: make one function return both deviance and # gradient of deviance - elif self.solver == 'lbfgs': - func = family._deviance - fprime = family._deviance_derivative - args = (Xnew, y, weights, link) - coef, loss, info = optimize.fmin_l_bfgs_b( - func, coef, fprime=fprime, - args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." 
- " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}".format( - info["task"])) - self.n_iter_ = info['nit'] - elif self.solver == 'newton-cg': - func = family._deviance - grad = family._deviance_derivative + elif self.solver in ['lbfgs', 'newton-cg']: + def func(coef, *args): + if P2.ndim == 1: + L2 = safe_sparse_dot(coef.T, P2*coef) + else: + L2 = safe_sparse_dot(coef.T, safe_sparse_dot(P2, coef)) + # A[np.diag_indices_from(A)] += P2 + return 0.5*family._deviance(coef, *args) + 0.5*L2 + + def fprime(coef, *args): + if P2.ndim == 1: + L2 = P2*coef + else: + L2 = safe_sparse_dot(P2, coef) + return 0.5*family._deviance_derivative(coef, *args) + L2 def grad_hess(coef, X, y, weights, link): - grad = (family._deviance_derivative( - coef, X, y, weights, link)) - hessian = (family._deviance_hessian( - coef, X, y, weights, link)) + if P2.ndim == 1: + L2 = P2*coef + else: + L2 = safe_sparse_dot(P2, coef) + grad = 0.5*family._deviance_derivative( + coef, X, y, weights, link) + L2 + hessian = 0.5*family._deviance_hessian( + coef, X, y, weights, link) + if P2.ndim == 1: + hessian[np.diag_indices_from(hessian)] += P2 + else: + hessian += P2 def Hs(s): - ret = np.dot(hessian, s) + ret = safe_sparse_dot(hessian, s) return ret return grad, Hs - hess = grad_hess + args = (Xnew, y, weights, link) - coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, - maxiter=self.max_iter, tol=self.tol) - self.coef_ = coef + if self.solver == 'lbfgs': + coef, loss, info = optimize.fmin_l_bfgs_b( + func, coef, fprime=fprime, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + elif self.solver == 'newton-cg': + coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + ####################################################################### + # 4. postprocessing # + ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] else: + # set intercept to zero as the other linear models do self.intercept_ = 0. self.coef_ = coef @@ -988,8 +1290,8 @@ def score(self, X, y, sample_weight=None): :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is average by sample_weight. In the case of a Normal - distribution, this D^2 equals R^2. + :math:`\bar{y}` is averaged by sample_weight. In the case of a Normal + distribution, D^2 equals R^2. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
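The D^2 defined here can be reproduced by hand; with unit sample weights and the Normal
deviance it reduces to the familiar R^2. A standalone sketch with made-up predictions
(not part of the patch)::

    import numpy as np

    y_true = np.array([1.0, 2.0, 3.0, 4.0])
    y_pred = np.array([1.1, 1.9, 3.2, 3.8])

    # the Normal/Gaussian unit deviance is the squared error, so D is just the RSS
    dev = np.sum((y_true - y_pred) ** 2)
    dev_null = np.sum((y_true - y_true.mean()) ** 2)   # intercept-only model
    d2 = 1 - dev / dev_null                            # identical to R^2 here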
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a4d4ea8650860..df0413b4d7836 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,19 +1,34 @@ import numpy as np from sklearn.linear_model.glm import ( - # Link, IdentityLink, + Link, + IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - # GeneralizedHyperbolicSecand, + GeneralizedHyperbolicSecand, GeneralizedLinearRegressor) +from sklearn.linear_model.ridge import Ridge from sklearn.utils.testing import ( - # assert_equal, + assert_equal, assert_almost_equal, assert_array_equal, assert_array_almost_equal) +def test_link_properties(): + """Test link inverse and derivative + """ + rng = np.random.RandomState(0) + x = rng.rand(100)*100 + from sklearn.linear_model.glm import Link + for link in vars()['Link'].__subclasses__(): + link = link() + assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) + assert_almost_equal(link.inverse_derivative(link.link(x)), + 1/link.derivative(x), decimal=10) + + def test_family_bounds(): """Test the valid range of distributions """ @@ -42,8 +57,23 @@ def test_family_bounds(): assert_array_equal(result, [False, False, True]) +def test_deviance_zero(): + """Test deviance(y,y) = 0 for different families + """ + for family in [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=4), + GeneralizedHyperbolicSecand()]: + assert_almost_equal(family.deviance(0.1, 0.1), 0, decimal=10) + assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) + + def test_glm_identiy_regression(): - """Test linear regression on a simple dataset + """Test GLM regression with identity link on a simple dataset """ coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -55,13 +85,13 @@ def test_glm_identiy_regression(): for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( - family=family, fit_intercept=False, solver=solver) + alpha=0, family=family, fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) def test_glm_log_regression(): - """Test linear regression on a simple dataset + """Test GLM regression with log link on a simple dataset """ coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -73,10 +103,79 @@ def test_glm_log_regression(): for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( - family=family, link=LogLink(), fit_intercept=False, - solver=solver, start_params='ols') + alpha=0, family=family, link=LogLink(), fit_intercept=False, + solver=solver, start_params='least_squares') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) +def test_normal_ridge(): + """Test ridge regression for Normal distributions + + Compare to test_ridge in test_ridge.py. 
+ """ + rng = np.random.RandomState(0) + alpha = 1.0 + + # With more samples than features + n_samples, n_features, n_predict = 6, 5, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + T = rng.randn(n_predict, n_features) + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge.fit(X, y) + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=True, solver=solver) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=False, solver='irls') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + # With more features than samples + n_samples, n_features, n_predict = 5, 10, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + T = rng.randn(n_predict, n_features) + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge.fit(X, y) + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=True, solver=solver) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=False, solver='irls') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + # TODO: Test compatibility with R's glm, glmnet From 5b46c23977a8e386987a2767b2c12d4296d332af Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 19 Sep 2017 00:40:34 +0200 Subject: [PATCH 007/209] [WIP] Add Generalized Linear Models (#9405) * fix some bugs in user guide linear_model.rst * fix some pep8 issues in test_glm.py --- doc/modules/linear_model.rst | 28 +++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 7 ++++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 51b3821fa6207..98736facd9b76 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -880,14 +880,14 @@ Generalized linear regression ============================= :class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [1]_. First, the predicted values :math:`\hat{y}` are linked to a linear +ways [8]_. 
First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [2]_. The objective function beeing minimized +exponential dispersion model (EDM) [9]_. The objective function beeing minimized becomes .. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 @@ -914,16 +914,20 @@ it is convenient to apply a link function different from the identity link :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This -ensures that the penalty treats features equally. +ensures that the penalty treats features equally. The estimator can be used as +follows:: - >>> from sklearn import linear_model - >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0) - >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> from sklearn.linear_model import GeneralizedLinearRegressor + >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + GeneralizedLinearRegressor(alpha=0.5, copy_X=True, family='poisson', + fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, + link='log', max_iter=100, solver='irls', start_params=None, + tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) - >>> reg.intercept_ - -0.76383575123143277 + >>> reg.intercept_ #doctest: +ELLIPSIS + -0.76383575... Mathematical formulation ------------------------ @@ -969,7 +973,7 @@ Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [3]_. + (19)-(20) in [10]_. * If you want to model a frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing @@ -979,12 +983,12 @@ Two remarks: .. topic:: References: - .. [1] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [8] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [2] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + .. [9] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. See also `Exponential dispersion model. `_ - .. [3] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [10] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. 
`_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index df0413b4d7836..b62b51b5bcb9e 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from sklearn.linear_model.glm import ( Link, - IdentityLink, + # IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, @@ -21,8 +21,9 @@ def test_link_properties(): """ rng = np.random.RandomState(0) x = rng.rand(100)*100 - from sklearn.linear_model.glm import Link - for link in vars()['Link'].__subclasses__(): + # from sklearn.linear_model.glm import Link + # for link in vars()['Link'].__subclasses__(): + for link in Link.__subclasses__(): link = link() assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) assert_almost_equal(link.inverse_derivative(link.link(x)), From 10dd14603a5fc04f53ca4920621434aaff662064 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 3 Dec 2017 19:54:57 +0100 Subject: [PATCH 008/209] [WIP] Add Generalized Linear Models (#9405) * added test: ridge poisson with log-link compared to glmnet * fix ValueError message for l1_ratio * fix ValueError message for P2 * string comparison: use '==' and '!=' instead of 'is' and 'is not' * fix RuntimeWarnings in unit_deviance of poisson: x*log(x) as xlogy * added test for fisher matrix * added test for family argument --- sklearn/linear_model/glm.py | 29 ++++++----- sklearn/linear_model/tests/test_glm.py | 72 ++++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 2db3c56d5e1c1..93ce358a8a874 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -15,6 +15,8 @@ # TODO: Make it as much consistent to other estimators in linear_model as # possible # TODO: options P1 and P2 in fit() or in __init__()??? +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -42,7 +44,7 @@ from abc import ABCMeta, abstractmethod, abstractproperty import numbers import numpy as np -from scipy import linalg, optimize, sparse +from scipy import linalg, optimize, sparse, special import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -340,7 +342,7 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): = \mathbf{X}^T W \mathbf{X} \,, with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see func:`score_function`. + see func:`_score`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -363,7 +365,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): \mathbf{H}(\boldsymbol{w}) = -\frac{\partial^2 loglike}{\partial\boldsymbol{w} \partial\boldsymbol{w}^T} - = \mathbf{X}^T \legt[ + = \mathbf{X}^T \left[ - \mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R} + \mathbf{D}^2 @@ -393,7 +395,7 @@ def _deviance_derivative(self, coef, X, y, weights, link): r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a function of the coefficients `coef`. This is equivalent to :math:`-2\phi` times the score function - :func:`score_function` (derivative of the log-likelihood). + :func:`_score` (derivative of the log-likelihood). 
""" score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) @@ -510,7 +512,8 @@ def unit_deviance(self, y, mu): return (y-mu)**2 if p == 1: # PoissonDistribution - return 2 * (np.where(y == 0, 0, y*np.log(y/mu))-y+mu) + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -921,7 +924,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): raise ValueError("l1_ratio must be in interval [0, 1]; got" - " (l1_ratio={0]})".format(self.l1_ratio)) + " (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) @@ -948,7 +951,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params is not 'least_squares': + if start_params is not None and start_params != 'least_squares': start_params = np.atleast_1d(start_params) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): @@ -986,7 +989,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " - "got (P2.shape=({0},{1})), needed ({3},{3})" + "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) family = self._family_instance @@ -1058,7 +1061,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif self.start_params is 'least_squares': + elif self.start_params == 'least_squares': if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1277,11 +1280,9 @@ def estimate_phi(self, y, X, sample_weight): dev = self._family_instance.deviance(y, mu, sample_weight) return dev/(n_samples - n_features) -# TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" -# in check_estimator for score -# from sklearn.utils.estimator_checks import check_estimator -# from sklearn.linear_model import GeneralizedLinearRegressor -# check_estimator(GeneralizedLinearRegressor) + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): r"""Returns D^2, a generalization of the coefficient of determination R^2, which uses deviance instead of squared error. diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index b62b51b5bcb9e..de7de90db967b 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,4 +1,6 @@ import numpy as np +from numpy.testing import assert_allclose +import scipy as sp from sklearn.linear_model.glm import ( Link, @@ -73,6 +75,46 @@ def test_deviance_zero(): assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) +def test_fisher_matrix(): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + for family in [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution()]: + link = LogLink() + rng = np.random.RandomState(0) + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)] + ) + assert_allclose(fisher, approx, rtol=1e-3) + + +def test_glm_family_argument(): + """Test GLM family argument set as string + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for (f, fam) in [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution())]: + glm = GeneralizedLinearRegressor(family=f, fit_intercept=False, + alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) + + def test_glm_identiy_regression(): """Test GLM regression with identity link on a simple dataset """ @@ -82,7 +124,8 @@ def test_glm_identiy_regression(): families = ( NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecand()) for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( @@ -100,7 +143,8 @@ def test_glm_log_regression(): families = ( NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecand()) for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( @@ -179,4 +223,26 @@ def test_normal_ridge(): assert_array_almost_equal(glm.predict(T), ridge.predict(T)) -# TODO: Test compatibility with R's glm, glmnet +def test_poisson_ridge(): + """Test ridge regression with poisson family and LogLink + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, family='poisson', + link='log', tol=1e-10) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, decimal=7) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=7) From 72485b63e89879e65381bca12152b54600fd3970 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 8 Jan 2018 22:13:45 +0100 Subject: [PATCH 009/209] [WIP] Add Generalized Linear Models (#9405) * put arguments P1, P2 and check_input from fit to __init__ * added check_input test: is P2 positive definite? 
* added solver option: 'auto' --- sklearn/linear_model/glm.py | 181 +++++++++++++++++++++++------------- 1 file changed, 117 insertions(+), 64 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 93ce358a8a874..3f6b91026ef9b 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -14,7 +14,7 @@ # TODO: Write examples and more docu # TODO: Make it as much consistent to other estimators in linear_model as # possible -# TODO: options P1 and P2 in fit() or in __init__()??? +# TODO: which dtype to force for y and X? Which for P1, P2? # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? @@ -709,6 +709,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. + P1 : None or array of shape (n_features*, ), optional\ + (default=None) + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``None`` is the same as an array of ones. + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + P2 : None or array of shape (n_features*, n_features*) + With this square matrix the L2 penalty is calculated as `w P2 w`. + This gives a fine control over this penalty (Tikhonov + regularization). + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). P2 must be positive semi-definite. + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -727,13 +742,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'newton-cg', 'lbfgs'}, optional (defaul='irls') + solver : {'auto', 'irls', 'newton-cg', 'lbfgs'}, optional (defaul='auto') Algorithm to use in the optimization problem. - - 'irls' is iterated reweighted least squares. It is the standard - algorithm for GLMs. + - 'irls' is iterated reweighted least squares (Fisher scoring). + It is the standard algorithm for GLMs. Cannot deal with + L1 penalties. + + - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'newton-cg', 'lbfgs' + - 'auto' sets 'irls'. max_iter : int, optional (default=100) TODO @@ -766,6 +784,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. + check_input : boolean, optional (default=True) + Allow to bypass several checks on input: y values in range of family, + sample_weights non-negative, P2 positive semi-definite. + Don't use this parameter unless you know what you do. + verbose : int, optional (default=0) For the lbfgs solver set verbose to any positive number for verbosity. 
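With ``P1``/``P2`` now available in ``__init__``, a fine-grained (Tikhonov) L2 penalty can
be set up once and reused across fits. An illustrative construction, not taken from the
patch, that penalizes differences between neighbouring coefficients::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    n_features = 5
    D = np.diff(np.eye(n_features), axis=0)    # first-order difference matrix
    P2 = D.T @ D                               # positive semi-definite penalty matrix

    reg = GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0, P2=P2,
                                     family='poisson', link='log')
    # reg.fit(X, y) would then shrink jumps between adjacent coefficients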
@@ -790,13 +813,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- TODO """ - def __init__(self, alpha=1.0, l1_ratio=0, + def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', - fit_dispersion='chisqr', solver='irls', max_iter=100, + fit_dispersion='chisqr', solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params=None, copy_X=True, - verbose=0): + check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 self.fit_intercept = fit_intercept self.family = family self.link = link @@ -807,10 +832,10 @@ def __init__(self, alpha=1.0, l1_ratio=0, self.warm_start = warm_start self.start_params = start_params self.copy_X = copy_X + self.check_input = check_input self.verbose = verbose - def fit(self, X, y, sample_weight=None, P1=None, P2=None, - check_input=True): + def fit(self, X, y, sample_weight=None): """Fit a generalized linear model. Parameters @@ -823,31 +848,13 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, sample_weight : array of shape (n_samples, ) or None,\ optinal (default=None) - Individual weights for each sample. - Var[Y_i]=phi/weight_i * v(mu) - If Y_i ~ EDM(mu, phi/w_i) then + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. - P1 : None or array of shape (n_features*, ), optional\ - (default=None) - With this array, you can exclude coefficients from ths L1 penalty. - Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as an array of ones. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). - - P2 : None or array of shape (n_features*, n_features*) - With this square matrix the L2 penalty is calculated as `w P2 w`. - This gives a fine control over this penalty (Tikhonov - regularization). - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). - - check_input : boolean, optional (default=True) - Allow to bypass several input checking. - Don't use this parameter unless you know what you do. - Returns ------- self : returns an instance of self. 
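One practical reading of ``sample_weight`` as documented above is modelling a rate per
exposure (the 'scaled' Poisson case mentioned in the class notes). A hedged sketch with
hypothetical counts and exposures::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    counts = np.array([0., 1., 3., 10.])
    exposure = np.array([1., 1., 2., 5.])      # e.g. observation time per row
    X = np.array([[0.], [0.], [1.], [1.]])

    y = counts / exposure                       # model the rate ...
    reg = GeneralizedLinearRegressor(alpha=0, family='poisson', link='log')
    reg.fit(X, y, sample_weight=exposure)       # ... each row weighted by its exposure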
@@ -872,16 +879,6 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, elif weights.shape[0] != y.shape[0]: raise ValueError("Sample weights must have the same length as" " y") - # IMPORTANT NOTE: Since we want to minimize - # 1/(2*sum(sample_weight)) * deviance + L1 + L2, - # deviance = sum(sample_weight * unit_deviance), - # we rescale weights such that sum(weights) = 1 and this becomes - # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) - weights = weights/np.sum(weights) - - if not isinstance(check_input, bool): - raise ValueError("The argument check_input must be bool; got " - "(check_input={0})".format(check_input)) # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class @@ -928,17 +925,22 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['irls', 'lbfgs', 'newton-cg']: - raise ValueError("GLM Regression supports only irls, lbfgs and" - "newton-cg solvers, got {0}".format(self.solver)) + if self.solver == 'auto': + solver = 'irls' + else: + solver = self.solver + if solver not in ['irls', 'lbfgs', 'newton-cg']: + raise ValueError("GeneralizedLinearRegressor supports only irls, " + "lbfgs and newton-cg solvers, got {0}" + "".format(solver)) if self.alpha > 0: if (self.l1_ratio > 0 and - self.solver not in []): + solver not in []): # TODO: Add solver for L1 # raise ValueError("The solver option (solver={0}) is not " # "appropriate for the chosen penalty which" # " includes L1 (alpha={1})." - # .format(self.solver, self.alpha)) + # .format(solver, self.alpha)) raise NotImplementedError("Currently, no solver is implemented" " that can deal with L1 penalties.") if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: @@ -964,28 +966,32 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The attribute check_input must be bool; got " + "(check_input={0})".format(self.check_input)) - if P1 is None: + if self.P1 is None: P1 = np.ones(X.shape[1]) else: - P1 = np.atleast_1d(P1) + P1 = np.atleast_1d(np.copy(self.P1)) if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." 
.format(P1.shape[0], X.shape[1])) - if P2 is None: + if self.P2 is None: P2 = np.ones(X.shape[1]) if sparse.issparse(X): P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() else: - P2 = check_array(P2, accept_sparse=['csr', 'csc', 'coo'], + P2 = check_array(self.P2, copy=True, + accept_sparse=['csr', 'csc', 'coo'], dtype="numeric", ensure_2d=True) - if ((P2.shape[0] != P2.shape[1]) or - (P2.shape[0] != X.shape[1]) or - (P2.ndim != 2)): + if ((P2.ndim != 2) or + (P2.shape[0] != P2.shape[1]) or + (P2.shape[0] != X.shape[1])): raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " @@ -1020,16 +1026,39 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, P2 *= l2 # 1.3 additional validations ########################################## - if check_input: + if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + # check that P2 is positive semidefinite + # np.linalg.cholesky(P2) 'only' asserts positive definite + if self.P2 is not None: + if sparse.issparse(P2): + # TODO: check sparse P2 for non-negativeness + raise NotImplementedError("Check sparse P2 for " + "non-negaitveness is not yet " + "implemented.") + elif P2.ndim == 2: + if not np.all(np.linalg.eigvals(P2) >= -1e-15): + raise ValueError("P2 must be positive definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? ####################################################################### - # 2. initialization of coef = (intercept_, coef_) # + # 2. rescaling of weights (sample_weight) # + ####################################################################### + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights = weights/np.sum(weights) + + ####################################################################### + # 3. initialization of coef = (intercept_, coef_) # ####################################################################### # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. @@ -1082,13 +1111,13 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, coef = start_params ####################################################################### - # 3. fit # + # 4. 
fit # ####################################################################### # algorithms for optimiation # TODO: Parallelize it self.n_iter_ = 0 converged = False - # 3.1 IRLS ############################################################ + # 4.1 IRLS ############################################################ # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' # Obj = objective function = 1/2 Dev + l2/2 w P2 w # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 @@ -1104,7 +1133,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, # Obj'' ~ X' W X + l2 P2 # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) # Note: P2 = l2*P2, see above - if self.solver == 'irls': + if solver == 'irls': # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) @@ -1150,10 +1179,10 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, "of iterations (currently {0})" .format(self.max_iter), ConvergenceWarning) - # 3.2 L-BFGS and Newton-CG ############################################ + # 4.2 L-BFGS and Newton-CG ############################################ # TODO: performance: make one function return both deviance and # gradient of deviance - elif self.solver in ['lbfgs', 'newton-cg']: + elif solver in ['lbfgs', 'newton-cg']: def func(coef, *args): if P2.ndim == 1: L2 = safe_sparse_dot(coef.T, P2*coef) @@ -1190,7 +1219,7 @@ def Hs(s): args = (Xnew, y, weights, link) - if self.solver == 'lbfgs': + if solver == 'lbfgs': coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1204,13 +1233,37 @@ def Hs(s): warnings.warn("lbfgs failed for the reason: {0}" .format(info["task"])) self.n_iter_ = info['nit'] - elif self.solver == 'newton-cg': + elif solver == 'newton-cg': coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, args=args, maxiter=self.max_iter, tol=self.tol) + # 4.3 coordinate descent ############################################## + # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + # An Improved GLMNET for L1-regularized Logistic Regression, + # Journal of Machine Learning Research 13 (2012) 1999-2030 + # Note: Use Fisher matrix instead of Hessian + # + # 1. find optimal descent direction d by minimizing + # min_d F(w+d) = min_d F(w+d) - F(w) + # F = f + g; f(w) = 1/2 dev; g(w) = 1/2*w*P2*w + ||P1*w||_1 + # 2. quadrdatic approx of f(w+d)-f(w): + # q(d) = f'(w)*d +1/2 d*H*d + # min_d q(d) + g(w+d) - g(w) + # 3. coordinate descent by updating coordinate j (d -> d+z*e_j): + # min_z q(d+z*e_j) + g(w+d+z*e_j) - g(w) + # = min_z q(d+z e_j) - q(d) + g(w+d+z*e_j) - g(w+d) + # TODO + # elif solver == 'cd': + # line search parameters + # (beta, sigma) = (0.5, 0.01) + # for iteration k from 1 to maxiter + # for coordinate j sample at random + # np.random.choice(coord, replace = False) + # + ####################################################################### - # 4. postprocessing # + # 5. 
postprocessing # ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] From 5c1369bde863a73aff46a502acd70e58e06dcb85 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 15:22:08 +0100 Subject: [PATCH 010/209] [WIP] Add Generalized Linear Models (#9405) * added coordinate descent solver * skip doctest for GeneralizedLinearRegressor example * symmetrize P2 => use P2 = 1/2 (P2+P2') * better validation of parameter start_params --- doc/modules/linear_model.rst | 21 +- sklearn/linear_model/glm.py | 353 ++++++++++++++++++++----- sklearn/linear_model/tests/test_glm.py | 82 ++++-- 3 files changed, 365 insertions(+), 91 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 98736facd9b76..834466e494a4a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -894,9 +894,9 @@ becomes +\frac{\alpha(1-\rho)}{2} w^T P_2 w with sample weights :math:`s`. -:math:`P_1` can be used to exclude some of the coefficients in the L1 -penalty, :math:`P_2` (must be positive semi-definite) allows for a more -versatile L2 penalty. +:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in +the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows +for a more versatile L2 penalty. Use cases, where a loss different from the squared loss might be appropriate, are the following: @@ -908,22 +908,23 @@ are the following: * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). Since the linear predictor :math:`Xw` can be negative and -Poisson, Gamma and Inverse Gaussian distributions don't have negative values, +Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as -follows:: +follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - GeneralizedLinearRegressor(alpha=0.5, copy_X=True, family='poisson', - fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, - link='log', max_iter=100, solver='irls', start_params=None, - tol=0.0001, verbose=0, warm_start=False) + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +SKIP + GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, + copy_X=True, family='poisson', fit_dispersion='chisqr', + fit_intercept=True, l1_ratio=0, link='log', max_iter=100, + random_state=None, selection='random', solver='auto', + start_params=None, tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 3f6b91026ef9b..3de82c20f33cf 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -3,20 +3,21 @@ """ # Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. 
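The documentation example above fits on raw features; since the penalty strength would otherwise differ per feature, a sketch of the recommended workflow is to standardize first, for instance in a pipeline (illustrative only, assuming this patch is installed so that ``GeneralizedLinearRegressor`` is importable)::

    import numpy as np
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[0., 0.], [0., 1.], [2., 2.], [3., 1.]])
    y = np.array([0., 1., 2., 2.])

    # scale features to zero mean and unit variance, then fit the penalized GLM
    reg = make_pipeline(
        StandardScaler(),
        GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log'))
    reg.fit(X, y)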
# License: BSD 3 clause # TODO: Write more tests -# TODO: Add l1-penalty (elastic net) +# TODO: Write examples and more docu # TODO: deal with option self.copy_X # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation -# TODO: Write examples and more docu -# TODO: Make it as much consistent to other estimators in linear_model as -# possible -# TODO: which dtype to force for y and X? Which for P1, P2? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? +# TODO: Negative values in P1 are not allowed so far. They could be used form +# group lasse. # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -52,10 +53,11 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six +from ..externals.six.moves import xrange from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, check_random_state class Link(six.with_metaclass(ABCMeta)): @@ -309,7 +311,9 @@ def _score(self, coef, phi, X, y, weights, link): .. math: - \mathbf{score}(\boldsymbol{w}) = \mathbf{X}^T \mathbf{D} + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and @@ -411,6 +415,29 @@ def _deviance_hessian(self, coef, X, y, weights, link): weights=weights, link=link) return 2*info_matrix + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): + """Calculates eta (linear predictor), mu, score function (derivative + of log-likelihood) and Fisher matrix (all with phi=1) all in one go""" + n_samples, n_features = X.shape + # eta = linear predictor + eta = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(eta) + sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) + d1 = link.inverse_derivative(eta) # = h'(eta) + # Alternatively: + # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g + # d1 = 1./link.derivative(mu) + d1_sigma_inv = sparse.dia_matrix((sigma_inv*d1, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d1_sigma_inv, (y-mu), dense_output=True) + score = safe_sparse_dot(X.T, temp, dense_output=True) + # + d2_sigma_inv = sparse.dia_matrix((sigma_inv*(d1**2), 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) + fisher = safe_sparse_dot(X.T, temp, dense_output=False) + return eta, mu, score, fisher + def starting_mu(self, y, weights=1): """Starting values for the mean mu_i in (unpenalized) IRLS.""" return ((weights*y+np.mean(weights*y)) / @@ -670,8 +697,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): alpha = a + b and l1_ratio = a / (a + b) The parameter `l1_ratio` corresponds to alpha in the glmnet R package while - alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio - = 1 is the lasso penalty. + 'alpha' corresponds to the lambda parameter in glmnet. Specifically, + l1_ratio = 1 is the lasso penalty. Read more in the :ref:`User Guide `. 
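For intuition, a dense NumPy sketch of the quantities computed by the ``_eta_mu_score_fisher`` helper added above, specialized to a Poisson GLM with log link, unit weights and phi=1 (the helper itself also supports sparse matrices and arbitrary families and links)::

    import numpy as np

    def eta_mu_score_fisher(coef, X, y):
        eta = X @ coef                 # linear predictor
        mu = np.exp(eta)               # inverse log link h(eta)
        sigma_inv = 1. / mu            # 1/Var[Y], Poisson unit variance v(mu) = mu
        d1 = mu                        # h'(eta) = exp(eta) for the log link
        # score = X' D Sigma^{-1} (y - mu)
        score = X.T @ (d1 * sigma_inv * (y - mu))
        # Fisher matrix = X' D^2 Sigma^{-1} X
        fisher = (X * (sigma_inv * d1 ** 2)[:, np.newaxis]).T @ X
        return eta, mu, score, fisher

    rng = np.random.RandomState(42)
    X = rng.rand(20, 3)
    y = rng.poisson(lam=np.exp(X @ np.array([0.1, -0.2, 0.3])))
    eta, mu, score, fisher = eta_mu_score_fisher(np.zeros(3), X, y)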
@@ -686,6 +713,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO: For `alpha` > 0, the feature matrix `X` is assumed to be standardized. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + Otherwise, the strength of the penalty is different for the features. TODO: Estimation of the dispersion parameter phi. @@ -742,19 +770,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'newton-cg', 'lbfgs'}, optional (defaul='auto') + solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ + optional (defaul='auto') Algorithm to use in the optimization problem. + - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. + - 'irls' is iterated reweighted least squares (Fisher scoring). It is the standard algorithm for GLMs. Cannot deal with L1 penalties. - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'auto' sets 'irls'. + - 'cd' is the coordinate descent algorithm. It can deal with L1 and + L2 penalties. max_iter : int, optional (default=100) - TODO + The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, @@ -781,6 +813,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): This option only applies if ``warm_start=False`` or if fit is called the first time (``self.coef_`` does not exist). + selection : str, optional (default='random') + For the solver 'cd' (coordinate descent), the coordinates (features) + can be updated in either cyclic or random order. + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator that selects a random + feature to be updated for solver 'cd' (coordinate descent). + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. 
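As a plain-Python sketch of the dispatch rule for ``solver='auto'`` described above, together with the scaling of the penalty terms applied in ``fit`` (``P1`` is multiplied by ``alpha * l1_ratio``, ``P2`` by ``alpha * (1 - l1_ratio)``); the helper name is illustrative only::

    def resolve_solver(solver, l1_ratio):
        # 'auto' chooses IRLS for a pure L2 penalty, coordinate descent otherwise
        if solver == 'auto':
            return 'irls' if l1_ratio == 0 else 'cd'
        return solver

    alpha, l1_ratio = 0.5, 0.3
    l1 = alpha * l1_ratio          # multiplies the L1 term ||P1 w||_1
    l2 = alpha * (1 - l1_ratio)    # multiplies the L2 term 1/2 w' P2 w

    assert resolve_solver('auto', 0) == 'irls'
    assert resolve_solver('auto', 0.3) == 'cd'
    assert resolve_solver('lbfgs', 0) == 'lbfgs'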
@@ -816,7 +865,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', fit_dispersion='chisqr', solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params=None, copy_X=True, + tol=1e-4, warm_start=False, start_params=None, + selection='random', random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio @@ -831,6 +881,8 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, self.tol = tol self.warm_start = warm_start self.start_params = start_params + self.selection = selection + self.random_state = random_state self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -925,24 +977,21 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver == 'auto': - solver = 'irls' - else: - solver = self.solver - if solver not in ['irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: raise ValueError("GeneralizedLinearRegressor supports only irls, " - "lbfgs and newton-cg solvers, got {0}" - "".format(solver)) - if self.alpha > 0: - if (self.l1_ratio > 0 and - solver not in []): - # TODO: Add solver for L1 - # raise ValueError("The solver option (solver={0}) is not " - # "appropriate for the chosen penalty which" - # " includes L1 (alpha={1})." - # .format(solver, self.alpha)) - raise NotImplementedError("Currently, no solver is implemented" - " that can deal with L1 penalties.") + "auto, lbfgs, newton-cg and cd solvers, got {0}" + "".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iteration must be positive;" " got (max_iter={0!r})".format(self.max_iter)) @@ -953,7 +1002,14 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params != 'least_squares': + if start_params is None: + pass + elif isinstance(start_params, six.string_types): + if start_params not in ['least_squares']: + raise ValueError("The argument start_params must be None, " + "'least-squares' or an array of right length," + " got(start_params={0})".format(start_params)) + else: start_params = np.atleast_1d(start_params) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): @@ -963,6 +1019,12 @@ def fit(self, X, y, sample_weight=None): .format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) + + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random', got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -974,15 +1036,16 @@ def fit(self, X, y, sample_weight=None): P1 = np.ones(X.shape[1]) else: P1 = np.atleast_1d(np.copy(self.P1)) - if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): + if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." .format(P1.shape[0], X.shape[1])) if self.P2 is None: - P2 = np.ones(X.shape[1]) - if sparse.issparse(X): + if not sparse.issparse(X): + P2 = np.ones(X.shape[1]) + else: P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() else: @@ -1024,6 +1087,12 @@ def fit(self, X, y, sample_weight=None): l2 = self.alpha * (1-self.l1_ratio) P1 *= l1 P2 *= l2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if sparse.issparse(P2): + P2 = 0.5 * (P2 + P2.transpose()) + else: + P2 = 0.5 * (P2 + P2.T) # 1.3 additional validations ########################################## if self.check_input: @@ -1033,14 +1102,20 @@ def fit(self, X, y, sample_weight=None): .format(family.__class__.__name__)) if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") - # check that P2 is positive semidefinite + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
+ if self.P1 is not None: + if not np.all(P1 >= 0): + raise ValueError("P1 must not have negative values.") + # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite if self.P2 is not None: if sparse.issparse(P2): # TODO: check sparse P2 for non-negativeness - raise NotImplementedError("Check sparse P2 for " - "non-negaitveness is not yet " - "implemented.") + # raise NotImplementedError("Check sparse P2 for " + # "non-negaitveness is not yet " + # "implemented.") + pass elif P2.ndim == 2: if not np.all(np.linalg.eigvals(P2) >= -1e-15): raise ValueError("P2 must be positive definite.") @@ -1090,7 +1165,8 @@ def fit(self, X, y, sample_weight=None): else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif self.start_params == 'least_squares': + elif (isinstance(self.start_params, six.string_types) and + self.start_params == 'least_squares'): if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1102,7 +1178,7 @@ def fit(self, X, y, sample_weight=None): reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: - # TODO: Does this make sense? + # TODO: Does this make sense at all? reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio) reg.fit(Xnew, link.link(y)) @@ -1125,14 +1201,17 @@ def fit(self, X, y, sample_weight=None): # D2 = link.inverse_derivative(eta)^2 = D^2 # W = D2/V(mu) # l2 = alpha * (1 - l1_ratio) - # Obj' = d(Obj)/d(w) = 1/2 Dev' + P2 w + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w # = -X' D (y-mu)/V(mu) + l2 P2 w # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 # Use Fisher matrix instead of full info matrix -X'(...) X, # i.e. E[Dev''] with E[y-mu]=0: # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) - # Note: P2 = l2*P2, see above + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: we already set P2 = l2*P2, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: ' denotes derivative, but also transpose for matrices if solver == 'irls': # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) @@ -1150,9 +1229,8 @@ def fit(self, X, y, sample_weight=None): # working observations z = eta + (y-mu)/hp # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z + # A = X' W X + P2, b = X' W z coef = _irls_step(Xnew, W, P2, z) - # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) @@ -1242,25 +1320,182 @@ def Hs(s): # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin # An Improved GLMNET for L1-regularized Logistic Regression, # Journal of Machine Learning Research 13 (2012) 1999-2030 - # Note: Use Fisher matrix instead of Hessian + # Note: Use Fisher matrix instead of Hessian for H # # 1. find optimal descent direction d by minimizing # min_d F(w+d) = min_d F(w+d) - F(w) - # F = f + g; f(w) = 1/2 dev; g(w) = 1/2*w*P2*w + ||P1*w||_1 - # 2. quadrdatic approx of f(w+d)-f(w): - # q(d) = f'(w)*d +1/2 d*H*d - # min_d q(d) + g(w+d) - g(w) + # F = f + g, f(w) = 1/2 deviance, g(w) = 1/2 w*P2*w + ||P1*w||_1 + # 2. quadrdatic approximation of F(w+d)-F(w) = q(d): + # using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives + # q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + # + ||P1*(w+d)||_1 - ||P1*w||_1 + # min_d q(d) # 3. 
coordinate descent by updating coordinate j (d -> d+z*e_j): - # min_z q(d+z*e_j) + g(w+d+z*e_j) - g(w) - # = min_z q(d+z e_j) - q(d) + g(w+d+z*e_j) - g(w+d) - # TODO - # elif solver == 'cd': + # min_z q(d+z*e_j) + # = min_z q(d+z*e_j) - q(d) + # = min_z A_j z + 1/2 B_jj z^2 + # + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + # A = f'(w) + d*H(w) + (w+d)*P2 + # B = H+P2 + # Note: we already set P2 = l2*P2, P1 = l1*P1, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: f' = -score, H = Fisher matrix + elif solver == 'cd': # line search parameters - # (beta, sigma) = (0.5, 0.01) - # for iteration k from 1 to maxiter - # for coordinate j sample at random - # np.random.choice(coord, replace = False) - # + (beta, sigma) = (0.5, 0.01) + # max inner loops (cycles through all features) + max_inner_iter = 1000 + # some precalculations + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # initial stopping tolerance of inner loop + # use L1-norm of minimum-norm of subgradient of F + # fp_wP2 = f'(w) + w*P2 + if P2.ndim == 1: + fp_wP2 = -score + coef*P2 + else: + fp_wP2 = -score + safe_sparse_dot(coef, P2) + inner_tol = (np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2+np.sign(coef)*P1)) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while self.n_iter_ < self.max_iter: + self.n_iter_ += 1 + # initialize search direction d (to be optimized) + d = np.zeros_like(coef) + # inner loop + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + d = np.zeros_like(coef) + # A = f'(w) + d*H(w) + (w+d)*P2 + # B = H+P2 + # Note: f'=-score and H=fisher are updated at the end of outer + # iteration + B = fisher + if P2.ndim == 1: + coef_P2 = coef * P2 + B[np.diag_indices_from(B)] += P2 + else: + coef_P2 = safe_sparse_dot(coef, P2) + B += P2 + A = -score + coef_P2 # + d*(H+P2) but d=0 so far + inner_iter = 0 + while inner_iter < max_inner_iter: + inner_iter += 1 + if self.selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, ee 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + a = A[j] + b = B[j, j] + if P1[j] == 0: + if b == 0: + z = 0 + else: + z = -a/b + elif a + P1[j] < b * (coef[j]+d[j]): + if b == 0: + z = 0 + else: + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[j]+d[j]): + if b == 0: + z = 0 + else: + z = -(a - P1[j])/b + else: + z = -(coef[j] + d[j]) + # update direction d + d[j] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if sparse.issparse(B): + if sparse.isspmatrix_csc(B): + # slice columns + A += B[:, j].toarray().ravel() * z + else: + # slice rows + A += B[j, :].toarray().ravel() * z + else: + A += B[j, :] * z + # end of cycle + # stopping criterion for inner loop + # sum_i(|minimum-norm subgrad of q(d)_i|) + mn_subgrad = (np.where(coef + d == 0, + np.sign(A)*np.maximum(np.abs(A)-P1, 0), + A+np.sign(coef+d)*P1)) + mn_subgrad = np.sum(np.abs(mn_subgrad)) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. 
+ break + # end of inner loop + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1*coef, ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * ( + safe_sparse_dot(-score, d) + + safe_sparse_dot(coef_P2, d) + + linalg.norm(P1*(coef+d), ord=1) - + P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * safe_sparse_dot(coef_P2, coef) + + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + mu_wd = link.inverse(safe_sparse_dot(Xnew, coef+la*d, + dense_output=True)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1*(coef+la*d), ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * safe_sparse_dot((coef+la*d)*P2, coef+la*d) + else: + Fwd += 0.5 * (safe_sparse_dot(coef+la*d, + safe_sparse_dot(P2, coef+la*d))) + if Fwd-Fw <= sigma*la*bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # stopping criterion for outer loop + # sum_i(|minimum-norm subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + if P2.ndim == 1: + fp_wP2 = -score + coef*P2 + else: + fp_wP2 = -score + safe_sparse_dot(coef, P2) + mn_subgrad = (np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2+np.sign(coef)*P1)) + mn_subgrad = np.sum(np.abs(mn_subgrad)) + if mn_subgrad <= self.tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the number of iterations (currently {0})" + .format(self.max_iter), ConvergenceWarning) ####################################################################### # 5. postprocessing # diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de7de90db967b..87cc8bea45f5b 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,6 +1,7 @@ import numpy as np from numpy.testing import assert_allclose import scipy as sp +from scipy import sparse from sklearn.linear_model.glm import ( Link, @@ -126,7 +127,7 @@ def test_glm_identiy_regression(): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: for family in families: glm = GeneralizedLinearRegressor( alpha=0, family=family, fit_intercept=False, solver=solver) @@ -162,28 +163,31 @@ def test_normal_ridge(): rng = np.random.RandomState(0) alpha = 1.0 - # With more samples than features + # 1. 
With more samples than features n_samples, n_features, n_predict = 6, 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) T = rng.randn(n_predict, n_features) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, + solver='svd', normalize=False) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', - fit_intercept=True, solver=solver) + fit_intercept=True, tol=1e-6, + max_iter=100, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='svd', normalize=False) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -192,28 +196,30 @@ def test_normal_ridge(): assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) - # With more features than samples + # 2. With more features than samples and sparse n_samples, n_features, n_predict = 5, 10, 10 y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - T = rng.randn(n_predict, n_features) + X = sparse.csr_matrix(rng.randn(n_samples, n_features)) + T = sparse.csr_matrix(rng.randn(n_predict, n_features)) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, + solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', fit_intercept=True, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -240,9 +246,41 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, family='poisson', - link='log', tol=1e-10) + s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 7, 'cd': 7} + for solver in 
['irls', 'lbfgs', 'newton-cg', 'cd']: + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=1e-7, + solver=solver, max_iter=200) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=s_dec[solver]) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=s_dec[solver]) + + +def test_poisson_enet(): + """Test elastic net regression with poisson family and LogLink + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', tol=1e-7) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, decimal=7) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + assert_almost_equal(glm.intercept_, -0.03550978409, decimal=7) + assert_array_almost_equal(glm.coef_, [0.16936423283, 0.], decimal=7) From 91497a2abc4824cdcb72f88dc26c9fd347d54b0d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 20:37:27 +0100 Subject: [PATCH 011/209] [WIP] Add Generalized Linear Models (#9405) * bug for sparse matrices for newton-cg solver, function grad_hess * reduce precision for solver newton-cg in test_poisson_ridge * remedy doctest issues in linear_model.rst for example of GeneralizedLinearRegressor * remove unused import of xrange from six --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/glm.py | 3 +-- sklearn/linear_model/tests/test_glm.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 834466e494a4a..1f0946e97b059 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -919,7 +919,7 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +SKIP + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, copy_X=True, family='poisson', fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, link='log', max_iter=100, diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 3de82c20f33cf..25f3ee1f52a2e 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -53,7 +53,6 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six -from ..externals.six.moves import xrange from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg @@ -1288,7 +1287,7 @@ def grad_hess(coef, X, y, weights, link): if P2.ndim == 1: hessian[np.diag_indices_from(hessian)] += P2 else: - hessian += P2 + hessian = hessian + P2 def Hs(s): ret = safe_sparse_dot(hessian, s) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 87cc8bea45f5b..c48c59ebd0eda 100644 --- 
a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -246,7 +246,7 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 7, 'cd': 7} + s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', From b9e5105ddb011a2a4efd74eeb3033ebb824fa5a8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 21:44:10 +0100 Subject: [PATCH 012/209] [WIP] Add Generalized Linear Models (#9405) * bug in cd solver for sparse matrices * higer precision (smaller tol) in test_normal_ridge for sparse matrices * for each solver a separate precision (tol) in test_poisson_ridge --- sklearn/linear_model/glm.py | 2 +- sklearn/linear_model/tests/test_glm.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 25f3ee1f52a2e..b428ee7509d14 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1376,7 +1376,7 @@ def Hs(s): B[np.diag_indices_from(B)] += P2 else: coef_P2 = safe_sparse_dot(coef, P2) - B += P2 + B = B + P2 A = -score + coef_P2 # + d*(H+P2) but d=0 so far inner_iter = 0 while inner_iter < max_inner_iter: diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index c48c59ebd0eda..baad852dfb945 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -207,19 +207,20 @@ def test_normal_ridge(): solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, family='normal', link='identity', - fit_intercept=True, solver=solver) + fit_intercept=True, solver=solver, + max_iter=300) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -247,11 +248,12 @@ def test_poisson_ridge(): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} + s_tol = {'irls': 1e-8, 'lbfgs': 1e-7, 'newton-cg': 1e-7, 'cd': 1e-8} for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', - link='log', tol=1e-7, - solver=solver, max_iter=200) + link='log', tol=s_tol[solver], + solver=solver, max_iter=300) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, decimal=s_dec[solver]) From e317422e9dd860c4ed5a3c6ac6191eb8e560c365 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Jan 2018 21:44:04 +0100 Subject: [PATCH 013/209] [WIP] Add Generalized Linear Models 
(#9405) * improved documentation * additional option 'zero' for argument start_params * validation of sample_weight in function predict * input validation of estimate_phi * set default fit_dispersion=None * bug in estimate_phi because of weight rescaling * test for estimate_phi in normal ridge regression * extended tests for elastic net poisson --- sklearn/linear_model/glm.py | 116 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 39 +++++++-- 2 files changed, 110 insertions(+), 45 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b428ee7509d14..e5eda6108052c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -11,13 +11,13 @@ # TODO: deal with option self.copy_X # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. -# TODO: Add cross validation +# TODO: Add cross validation support # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used form -# group lasse. +# TODO: Negative values in P1 are not allowed so far. They could be used to +# for group lasso. # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -642,7 +642,7 @@ def _irls_step(X, W, P2, z): ------- coef: array, shape = (X.shape[1]) """ - # TODO: scipy.linalg.solve if faster, but ordinary least squares uses + # TODO: scipy.linalg.solve is faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? n_samples, n_features = X.shape if sparse.issparse(X): @@ -709,16 +709,20 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): (penalized) maximum likelihood which is equivalent to minimizing the deviance. - TODO: For `alpha` > 0, the feature matrix `X` is assumed to be - standardized. Call + For `alpha` > 0, the feature matrix `X` should be standardized in order to + penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - Otherwise, the strength of the penalty is different for the features. TODO: Estimation of the dispersion parameter phi. - TODO: Notes on weights and 'scaled' distributions. For Poisson, this means - to fit y = z/w with z=counts and w=exposure (time, money, persons, ...) - => y is a ratio with weights w. Same for other distributions. + If your target `y` is a ratio, you should also provide appropriate weights + `w`. As an example, consider Poission distributed counts `z` (integers) and + weights `w`=exposure (time, money, persons years, ...), then you fit + `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, + sample_weight=w)``. You need the weights for the right mean, consider: + :math:`\bar(y) = \frac{\sum_i w_i y_i}{\sum_i w_i}`. + In this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. Parameters ---------- @@ -800,8 +804,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to fit), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. 
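A sketch of the 'scaled' Poisson case described above (illustrative only, assuming this patch is installed): with counts ``z`` and exposure ``w``, fit the ratio ``y = z / w`` and pass ``w`` as ``sample_weight``; the weighted mean of ``y`` then equals total counts over total exposure::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(100, 2)
    exposure = rng.uniform(0.5, 2.0, size=100)                 # w: time, money, ...
    counts = rng.poisson(lam=exposure * np.exp(X @ np.array([0.3, -0.2])))  # z
    y = counts / exposure                                      # ratio target

    reg = GeneralizedLinearRegressor(alpha=0, family='poisson', link='log')
    reg.fit(X, y, sample_weight=exposure)

    # weighted mean: sum(w_i * y_i) / sum(w_i) == total counts / total exposure
    assert np.isclose(np.average(y, weights=exposure),
                      counts.sum() / exposure.sum())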
- start_params : None or array of shape (n_features, ) or 'least_squares'}, \ - optional (default=None) + start_params : {None, 'least_squares', 'zero'} or array of shape \ + (n_features, ) or }, optional (default=None) If an array of size n_features is supplied, use these as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. @@ -854,16 +858,18 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations of the solver. - Notes - ----- References ---------- - TODO + For the coordinate descent implementation: + .. [1] Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', - fit_dispersion='chisqr', solver='auto', max_iter=100, + fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params=None, selection='random', random_state=None, copy_X=True, check_input=True, verbose=0): @@ -1004,9 +1010,10 @@ def fit(self, X, y, sample_weight=None): if start_params is None: pass elif isinstance(start_params, six.string_types): - if start_params not in ['least_squares']: + if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " - "'least-squares' or an array of right length," + "'least-squares', 'zero' or an array of right" + " length," " got(start_params={0})".format(start_params)) else: start_params = np.atleast_1d(start_params) @@ -1129,6 +1136,7 @@ def fit(self, X, y, sample_weight=None): # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights_sum = np.sum(weights) weights = weights/np.sum(weights) ####################################################################### @@ -1141,7 +1149,8 @@ def fit(self, X, y, sample_weight=None): coef = None if self.warm_start and hasattr(self, "coef_"): if self.fit_intercept: - coef = np.concatenate((self.intercept_, self.coef_)) + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) else: coef = self.coef_ elif self.start_params is None: @@ -1164,24 +1173,27 @@ def fit(self, X, y, sample_weight=None): else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif (isinstance(self.start_params, six.string_types) and - self.start_params == 'least_squares'): - if self.alpha == 0: - reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ - elif self.l1_ratio <= 0.01: - # ElasticNet says l1_ratio <= 0.01 is not reliable, use Ridge - reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ - else: - # TODO: Does this make sense at all? 
- reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ + elif isinstance(self.start_params, six.string_types): + if self.start_params == 'zero': + coef = np.zeros(n_features) + elif self.start_params == 'least_squares': + if self.alpha == 0: + reg = LinearRegression(copy_X=True, fit_intercept=False) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + elif self.l1_ratio <= 0.01: + # ElasticNet says l1_ratio <= 0.01 is not reliable + # => use Ridge + reg = Ridge(copy_X=True, fit_intercept=False, + alpha=self.alpha) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + else: + # TODO: Does this make sense at all? + reg = ElasticNet(copy_X=True, fit_intercept=False, + alpha=self.alpha, l1_ratio=self.l1_ratio) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ else: coef = start_params @@ -1365,6 +1377,7 @@ def Hs(s): d = np.zeros_like(coef) # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper d = np.zeros_like(coef) # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 @@ -1508,7 +1521,8 @@ def Hs(s): self.coef_ = coef if self.fit_dispersion in ['chisqr', 'deviance']: - self.dispersion_ = self.estimate_phi(y, X, weights) + # attention because of rescaling of weights + self.dispersion_ = self.estimate_phi(y, X, weights)*weights_sum return self @@ -1544,9 +1558,23 @@ def predict(self, X, sample_weight=1): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - # TODO: validation of sample_weight eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) + if sample_weight is None: + return mu + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weight must be positive, " + "got (sample_weight={0})." + .format(sample_weight)) + else: + sample_weights = np.atleast_1d(sample_weight) + if sample_weight.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar.") + elif sample_weight.shape[0] != mu.shape[0]: + raise ValueError("Sample weights must have the same length as" + " X.shape[1].") + return mu*sample_weight def estimate_phi(self, y, X, sample_weight): @@ -1554,10 +1582,20 @@ def estimate_phi(self, y, X, sample_weight): Returns the estimate. """ check_is_fitted(self, "coef_") + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ + n_features += 1 + if n_samples <= n_features: + raise ValueError("Estimation of dispersion parameter phi requires" + " more samples than features, got" + " samples=X.shape[0]={0} and" + " n_features=X.shape[1]+fit_intercept={1}." + .format(n_samples, n_features)) mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': chisq = np.sum(sample_weight*(y-mu)**2 / diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index baad852dfb945..ee90cd51c874d 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -164,7 +164,7 @@ def test_normal_ridge(): alpha = 1.0 # 1. 
With more samples than features - n_samples, n_features, n_predict = 6, 5, 10 + n_samples, n_features, n_predict = 10, 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) T = rng.randn(n_predict, n_features) @@ -189,12 +189,16 @@ def test_normal_ridge(): ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', - fit_intercept=False, solver='irls') + fit_intercept=False, solver='irls', + fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + mu = glm.predict(X) + assert_almost_equal(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) # 2. With more features than samples and sparse n_samples, n_features, n_predict = 5, 10, 10 @@ -278,11 +282,34 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', tol=1e-7) + link='log', solver='cd', tol=1e-7) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + + # same for start_params='zero' with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + start_params='zero') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + # start_params='least_squares' with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', + link='log', solver='cd', tol=1e-5, + start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.03550978409, decimal=7) - assert_array_almost_equal(glm.coef_, [0.16936423283, 0.], - decimal=7) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) From 9a9818441d605bf86547651997db81e969f41cdf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Jan 2018 22:59:46 +0100 Subject: [PATCH 014/209] [WIP] Add Generalized Linear Models (#9405) * new helper function _check_weights for validation of sample_weight * fix white space issue in doctest of linear_model.rst --- doc/modules/linear_model.rst | 8 ++--- sklearn/linear_model/glm.py | 59 +++++++++++++++++------------------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 1f0946e97b059..f7b0ca0cc7add 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -921,10 +921,10 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion='chisqr', - fit_intercept=True, l1_ratio=0, link='log', max_iter=100, - random_state=None, selection='random', solver='auto', - start_params=None, tol=0.0001, 
verbose=0, warm_start=False) + copy_X=True, family='poisson', fit_dispersion='chisqr', + fit_intercept=True, l1_ratio=0, link='log', max_iter=100, + random_state=None, selection='random', solver='auto', + start_params=None, tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index e5eda6108052c..138830ea431c9 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -59,6 +59,26 @@ from ..utils.validation import check_is_fitted, check_random_state +def _check_weights(sample_weight, n_samples): + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight < 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight*np.ones(n_samples) + else: + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as" + " y") + if not np.all(sample_weight >= 0): + raise ValueError("Sample weights must be non-negative.") + + return weights + + class Link(six.with_metaclass(ABCMeta)): """Abstract base class for Link funtions """ @@ -925,17 +945,7 @@ def fit(self, X, y, sample_weight=None): dtype=_dtype, y_numeric=True, multi_output=False) y = y.astype(np.float64) - if sample_weight is None: - weights = np.ones_like(y) - elif np.isscalar(sample_weight): - weights = sample_weight*np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != y.shape[0]: - raise ValueError("Sample weights must have the same length as" - " y") + weights = _check_weights(sample_weight, y.shape[0]) # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class @@ -1544,7 +1554,7 @@ def linear_predictor(self, X): return safe_sparse_dot(X, self.coef_, dense_output=True) + self.intercept_ - def predict(self, X, sample_weight=1): + def predict(self, X, sample_weight=None): """Predict uing GLM with feature matrix X. If sample_weight is given, returns prediction*sample_weight. @@ -1558,26 +1568,13 @@ def predict(self, X, sample_weight=1): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ + weights = _check_weights(sample_weight, X.shape[0]) eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) - if sample_weight is None: - return mu - elif np.isscalar(sample_weight): - if sample_weight <= 0: - raise ValueError("Sample weight must be positive, " - "got (sample_weight={0})." - .format(sample_weight)) - else: - sample_weights = np.atleast_1d(sample_weight) - if sample_weight.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar.") - elif sample_weight.shape[0] != mu.shape[0]: - raise ValueError("Sample weights must have the same length as" - " X.shape[1].") - return mu*sample_weight + return mu*weights - def estimate_phi(self, y, X, sample_weight): + def estimate_phi(self, y, X, sample_weight=None): """Estimation of the dispersion parameter. Returns the estimate. 
""" @@ -1586,6 +1583,7 @@ def estimate_phi(self, y, X, sample_weight): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape + weights = _check_weights(sample_weight, n_samples) eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ @@ -1640,10 +1638,7 @@ def score(self, X, y, sample_weight=None): # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) - if sample_weight is None: - weights = np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) + weights = _check_weights(sample_weight, y.shape[0]) mu = self.predict(X) dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) From db9defe6b7637fe022034ca7f435f4fd37f6c118 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 08:33:59 +0100 Subject: [PATCH 015/209] [WIP] Add Generalized Linear Models (#9405) * fit_dispersion default=None also in docs. * improved docs. * fixed input validation of predict * fixed bug for sample_weight in estimate_phi --- sklearn/linear_model/glm.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 138830ea431c9..22a8be9e50828 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -754,7 +754,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - l1_ratio : float, optional (defaul=0) + l1_ratio : float, optional (default=0) The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a @@ -768,7 +768,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). - P2 : None or array of shape (n_features*, n_features*) + P2 : None or array of shape (n_features*, n_features*), optional\ + (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov regularization). @@ -781,20 +782,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ of class ExponentialDispersionModel, optional(default='normal') - the distributional assumption of the GLM. + the distributional assumption of the GLM, i.e. which loss function to + be minimized. link : {'identity', 'log'} or an instance of class Link, optional (default='identity') the link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul='chisqr') + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ - optional (defaul='auto') + optional (default='auto') Algorithm to use in the optimization problem. - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. 
@@ -830,11 +832,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. If 'least_squares' is set, the result of a least squares fit in the - link space (linear predictor) is taken. If ``None``, the start values - are calculated by setting mu to family.starting_mu(..) and one step of - irls. - This option only applies if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not exist). + link space (linear predictor) is taken. + If 'zero' is set, all coefficients start with zero. + If ``None``, the start values are calculated by setting mu to + family.starting_mu(..) and one step of irls. + These options only apply if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). selection : str, optional (default='random') For the solver 'cd' (coordinate descent), the coordinates (features) @@ -1550,7 +1553,9 @@ def linear_predictor(self, X): Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) return safe_sparse_dot(X, self.coef_, dense_output=True) + self.intercept_ @@ -1568,9 +1573,10 @@ def predict(self, X, sample_weight=None): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - weights = _check_weights(sample_weight, X.shape[0]) + # validation of X in linear_predictor eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) + weights = _check_weights(sample_weight, X.shape[0]) return mu*weights @@ -1596,11 +1602,11 @@ def estimate_phi(self, y, X, sample_weight=None): .format(n_samples, n_features)) mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(sample_weight*(y-mu)**2 / + chisq = np.sum(weights*(y-mu)**2 / self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, sample_weight) + dev = self._family_instance.deviance(y, mu, weights) return dev/(n_samples - n_features) # Note: check_estimator(GeneralizedLinearRegressor) might raise From dc7fdd7f8f6a3a2276f5f96aa6d5dd6ad3ce853e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 08:41:24 +0100 Subject: [PATCH 016/209] [WIP] Add Generalized Linear Models (#9405) * improved docs --- sklearn/linear_model/tests/test_glm.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index ee90cd51c874d..0af837c9c73f3 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -177,7 +177,8 @@ def test_normal_ridge(): glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, tol=1e-6, - max_iter=100, solver=solver) + max_iter=100, solver=solver, + random_state=42) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -214,7 +215,7 @@ def test_normal_ridge(): glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, family='normal', link='identity', fit_intercept=True, solver=solver, - max_iter=300) + max_iter=300, random_state=42) glm.fit(X, y) 
assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) @@ -257,7 +258,8 @@ def test_poisson_ridge(): glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', link='log', tol=s_tol[solver], - solver=solver, max_iter=300) + solver=solver, max_iter=300, + random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, decimal=s_dec[solver]) @@ -282,20 +284,23 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . + rand = 0 glmnet_intercept = -0.03550978409 glmnet_coef = [0.16936423283, 0.] X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-7) + link='log', solver='cd', tol=1e-7, + selection='random', random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) - # same for start_params='zero' with reduced precision + # same for start_params='zero' and selection='cyclic' + # with reduced precision glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', link='log', solver='cd', tol=1e-5, - start_params='zero') + selection='cyclic', start_params='zero') glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) From b11d06ba72865c14b0532c6d6c34d264a09d7ae4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 16:57:45 +0100 Subject: [PATCH 017/209] [WIP] Add Generalized Linear Models (#9405) * fixed input validation of X in predict --- sklearn/linear_model/glm.py | 4 +++- sklearn/linear_model/tests/test_glm.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 22a8be9e50828..032ded86816dd 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1573,7 +1573,9 @@ def predict(self, X, sample_weight=None): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - # validation of X in linear_predictor + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) weights = _check_weights(sample_weight, X.shape[0]) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 0af837c9c73f3..776edd8aeec46 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -284,7 +284,6 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . - rand = 0 glmnet_intercept = -0.03550978409 glmnet_coef = [0.16936423283, 0.] 
X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T From 9e6c01378a4cb245824bcb9429b3d566652af743 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 17:37:44 +0100 Subject: [PATCH 018/209] [WIP] Add Generalized Linear Models (#9405) * redundant line of code 'd = np.zeros_like(coef)' --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/glm.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f7b0ca0cc7add..dcb35b6a5d941 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -921,7 +921,7 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion='chisqr', + copy_X=True, family='poisson', fit_dispersion=None, fit_intercept=True, l1_ratio=0, link='log', max_iter=100, random_state=None, selection='random', solver='auto', start_params=None, tol=0.0001, verbose=0, warm_start=False) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 032ded86816dd..8856af8ec698a 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -73,7 +73,7 @@ def _check_weights(sample_weight, n_samples): elif weights.shape[0] != n_samples: raise ValueError("Sample weights must have the same length as" " y") - if not np.all(sample_weight >= 0): + if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") return weights @@ -1132,7 +1132,7 @@ def fit(self, X, y, sample_weight=None): if sparse.issparse(P2): # TODO: check sparse P2 for non-negativeness # raise NotImplementedError("Check sparse P2 for " - # "non-negaitveness is not yet " + # "non-negativeness is not yet " # "implemented.") pass elif P2.ndim == 2: @@ -1391,7 +1391,6 @@ def Hs(s): # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper - d = np.zeros_like(coef) # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 # Note: f'=-score and H=fisher are updated at the end of outer From bad0190a22623eae3a2f6dfcdb0fd8caee625111 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 27 Jan 2018 20:38:02 +0100 Subject: [PATCH 019/209] [WIP] Add Generalized Linear Models (#9405) * added test to compare to ElasticNet * deleted identical comment lines --- sklearn/linear_model/tests/test_glm.py | 29 +++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 776edd8aeec46..9990cafe2cbcf 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -12,7 +12,7 @@ GammaDistribution, InverseGaussianDistribution, GeneralizedHyperbolicSecand, GeneralizedLinearRegressor) -from sklearn.linear_model.ridge import Ridge +from sklearn.linear_model import ElasticNet, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, @@ -267,14 +267,37 @@ def test_poisson_ridge(): decimal=s_dec[solver]) +def test_normal_enet(): + """Tet elastic net regression with normal/gaussian family""" + rng = np.random.RandomState(0) + alpha, l1_ratio = 0.3, 0.7 + n_samples, n_features = 20, 2 + X = rng.randn(n_samples, n_features).copy(order='F') + beta = rng.randn(n_features) + y = 2 + np.dot(X, beta) + rng.randn(n_samples) + + glm 
= GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, + family='normal', link='identity', + fit_intercept=True, tol=1e-7, + max_iter=100, selection='cyclic', + solver='cd', start_params='zero', + check_input=False) + glm.fit(X, y) + + enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, + normalize=False, tol=1e-7, copy_X=True) + enet.fit(X, y) + + assert_almost_equal(glm.intercept_, enet.intercept_) + assert_array_almost_equal(glm.coef_, enet.coef_) + + def test_poisson_enet(): """Test elastic net regression with poisson family and LogLink Compare to R's glmnet""" # library("glmnet") # options(digits=10) - # library("glmnet") - # options(digits=10) # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) # x <- data.matrix(df[,c("a", "b")]) # y <- df$y From 48137d86079c9a8efd15d57e719e3ee35f1644c9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 11:52:19 +0100 Subject: [PATCH 020/209] [WIP] Add Generalized Linear Models (#9405) * increased precision in test_normal_enet --- sklearn/linear_model/tests/test_glm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 9990cafe2cbcf..6f8bdd3a72f40 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -278,18 +278,18 @@ def test_normal_enet(): glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, family='normal', link='identity', - fit_intercept=True, tol=1e-7, + fit_intercept=True, tol=1e-8, max_iter=100, selection='cyclic', solver='cd', start_params='zero', check_input=False) glm.fit(X, y) enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, - normalize=False, tol=1e-7, copy_X=True) + normalize=False, tol=1e-8, copy_X=True) enet.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_) - assert_array_almost_equal(glm.coef_, enet.coef_) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) def test_poisson_enet(): From 2c2a077a2e8c57bdf3c945678526d6efdb5763e9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 12:42:03 +0100 Subject: [PATCH 021/209] [WIP] Add Generalized Linear Models (#9405) * better doc for heavy tailed distributions --- doc/modules/linear_model.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index dcb35b6a5d941..5cb8e54afbb06 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -905,7 +905,8 @@ are the following: * If the target values are positive valued and skewed, you might try a Gamma deviance. - * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). + * If the target values seem to be heavier tailed than a Gamma distribution, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). + Keep in mind that the mean is not a good measure for very heavy tailed distributions, cf. extreme value theory. 
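The advice above follows from the Tweedie variance function :math:`v(\mu) = \mu^p`: the higher the power :math:`p`, the faster the variance grows with the mean and the heavier the implied tail. A plain-numpy illustration (the mean values are arbitrary)::

    import numpy as np

    mu = np.array([0.5, 1.0, 5.0, 20.0])
    for p, name in [(0, 'normal'), (1, 'poisson'), (2, 'gamma'), (3, 'inverse.gaussian')]:
        # unit variance v(mu) = mu**p
        print(name, np.power(mu, p))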
Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, From 15931c3148b68c47aa3c3c19983525ae758a0981 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 15:18:26 +0100 Subject: [PATCH 022/209] [WIP] Add Generalized Linear Models (#9405) * improved input validation and testing of them --- sklearn/linear_model/glm.py | 24 +++- sklearn/linear_model/tests/test_glm.py | 174 ++++++++++++++++++++++++- 2 files changed, 189 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 8856af8ec698a..33df6b4c9b850 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -63,11 +63,14 @@ def _check_weights(sample_weight, n_samples): if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): - if sample_weight < 0: + if sample_weight <= 0: raise ValueError("Sample weights must be non-negative.") weights = sample_weight*np.ones(n_samples) else: - weights = np.atleast_1d(sample_weight) + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse='csr', + force_all_finite=True, ensure_2d=False, + dtype=_dtype) if weights.ndim > 1: raise ValueError("Sample weight must be 1D array or scalar") elif weights.shape[0] != n_samples: @@ -75,6 +78,9 @@ def _check_weights(sample_weight, n_samples): " y") if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") return weights @@ -1010,10 +1016,12 @@ def fit(self, X, y, sample_weight=None): "with L1 penalties, which are included with " "(alpha={1}) and (l1_ratio={2})." 
.format(solver, self.alpha, self.l1_ratio)) - if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: - raise ValueError("Maximum number of iteration must be positive;" + if (not isinstance(self.max_iter, six.integer_types) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" " got (max_iter={0!r})".format(self.max_iter)) - if not isinstance(self.tol, numbers.Number) or self.tol < 0: + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol={0!r})".format(self.tol)) if not isinstance(self.warm_start, bool): @@ -1029,7 +1037,9 @@ def fit(self, X, y, sample_weight=None): " length," " got(start_params={0})".format(start_params)) else: - start_params = np.atleast_1d(start_params) + start_params = check_array(start_params, accept_sparse='csr', + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" @@ -1160,7 +1170,7 @@ def fit(self, X, y, sample_weight=None): # set start values for coef coef = None - if self.warm_start and hasattr(self, "coef_"): + if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), self.coef_)) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 6f8bdd3a72f40..1abbcf0540e28 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -5,7 +5,7 @@ from sklearn.linear_model.glm import ( Link, - # IdentityLink, + IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, @@ -16,7 +16,8 @@ from sklearn.utils.testing import ( assert_equal, assert_almost_equal, - assert_array_equal, assert_array_almost_equal) + assert_array_equal, assert_array_almost_equal, + assert_raises) def test_link_properties(): @@ -102,6 +103,34 @@ def f(coef): assert_allclose(fisher, approx, rtol=1e-3) +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight""" + # 1. scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y, weights) + + # 2. 2d array + weights = [[0]] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 3. 1d but wrong length + weights = [1, 0] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 4. 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 5. 
1d but weith a negative value + weights = [2, -1] + assert_raises(ValueError, glm.fit, X, y, weights) + + def test_glm_family_argument(): """Test GLM family argument set as string """ @@ -115,6 +144,147 @@ def test_glm_family_argument(): alpha=0).fit(X, y) assert_equal(type(glm._family_instance), type(fam)) + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_link_argument(): + """Test GLM link argument set as string + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for (l, link) in [('identity', IdentityLink()), + ('log', LogLink())]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) + + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + link='not a link') + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_alpha_argument(): + """Test GLM alpha argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for alpha in ['not a number', -4.2]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + alpha=alpha) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_l1_ratio_argument(): + """Test GLM l1_ratio argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for l1_ratio in ['not a number', -4.2, 1.1, [1]]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + l1_ratio=l1_ratio) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_fit_intercept_argument(): + """Test GLM fit_intercept argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for fit_intercept in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_solver_argument(): + """Test GLM solver argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for solver in ['not a solver', 1, [1]]: + glm = GeneralizedLinearRegressor(solver=solver) + assert_raises(ValueError, glm.fit, X, y) + + # solver not suitable for L1 penalty + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(solver=solver, alpha=1, l1_ratio=0.1) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_max_iter_argument(): + """Test GLM max_iter argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for max_iter in ['not a number', 0, -1, 5.5, [1]]: + glm = GeneralizedLinearRegressor(max_iter=max_iter) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_tol_argument(): + """Test GLM tol argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for tol in ['not a number', 0, -1.0, [1e-3]]: + glm = GeneralizedLinearRegressor(tol=tol) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_warm_start_argument(): + """Test GLM warm_start argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for warm_start in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(warm_start=warm_start) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_start_params_argument(): + """Test GLM start_params argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for start_params in ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]: + glm = GeneralizedLinearRegressor(start_params=start_params) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_selection_argument(): + """Test GLM selection argument + """ + y = np.array([1, 2]) + X = np.array([[1], 
[1]]) + for selection in ['not a selection', 1, 0, ['cyclic']]: + glm = GeneralizedLinearRegressor(selection=selection) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_check_input_argument(): + """Test GLM check_input argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for check_input in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(check_input=check_input) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_random_state_argument(): + """Test GLM random_state argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for random_state in ['a string', 0.5, [0]]: + glm = GeneralizedLinearRegressor(random_state=random_state) + assert_raises(ValueError, glm.fit, X, y) + + +# TODO: check P1 and P2 +# TODO: check additional validations if check_input == True def test_glm_identiy_regression(): """Test GLM regression with identity link on a simple dataset From feedba379c2a8f53a7a9792e33041f29da7a4c95 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 30 Mar 2018 18:50:06 +0200 Subject: [PATCH 023/209] [MRG] Add Generalized Linear Models (#9405) * improved input validation and testing of P1 * test case for validation of argument P2 * test case for validation of argument copy_X --- sklearn/linear_model/glm.py | 3 ++ sklearn/linear_model/tests/test_glm.py | 48 +++++++++++++++++++++----- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 33df6b4c9b850..eae4a56ea1d95 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1065,6 +1065,9 @@ def fit(self, X, y, sample_weight=None): P1 = np.ones(X.shape[1]) else: P1 = np.atleast_1d(np.copy(self.P1)) + if P1.dtype.kind not in ['b', 'i', 'u', 'f']: + raise ValueError("P1 must be a numeric value; " + "got (dtype={0}).".format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1abbcf0540e28..edf579a416973 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -187,6 +187,27 @@ def test_glm_l1_ratio_argument(): assert_raises(ValueError, glm.fit, X, y) +def test_glm_P1_argument(): + """Test GLM P1 arguments + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for P1 in [['a string', 'a string'], [1, [2]], [1, 2, 3]]: + glm = GeneralizedLinearRegressor(P1=P1) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_P2_argument(): + """Test GLM P2 arguments + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for P2 in [np.full((2, 2), 'a string'), [[1, [2]], [3, 4]], [1, 2, 3], + [[1, 2]], [[1], [2]]]: + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y) + + def test_glm_fit_intercept_argument(): """Test GLM fit_intercept argument """ @@ -263,27 +284,36 @@ def test_glm_selection_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_check_input_argument(): - """Test GLM check_input argument +def test_glm_random_state_argument(): + """Test GLM random_state argument """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for check_input in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(check_input=check_input) + for random_state in ['a string', 0.5, [0]]: + glm = GeneralizedLinearRegressor(random_state=random_state) assert_raises(ValueError, glm.fit, X, y) -def 
test_glm_random_state_argument(): - """Test GLM random_state argument +def test_glm_copy_X_argument(): + """Test GLM copy_X arguments """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for random_state in ['a string', 0.5, [0]]: - glm = GeneralizedLinearRegressor(random_state=random_state) + for copy_X in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(copy_X=copy_X) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_check_input_argument(): + """Test GLM check_input argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for check_input in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(check_input=check_input) assert_raises(ValueError, glm.fit, X, y) -# TODO: check P1 and P2 # TODO: check additional validations if check_input == True def test_glm_identiy_regression(): From 6fdfb47428571b4c8e89046a7b1f481711832f61 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 30 Mar 2018 19:41:09 +0200 Subject: [PATCH 024/209] [MRG] Add Generalized Linear Models (#9405) * fix doctest failure in example of linear_model.rst * fix dtype issue in test_glm_P2_argument --- doc/modules/linear_model.rst | 4 ++-- sklearn/linear_model/tests/test_glm.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 5cb8e54afbb06..9f85da771c6f1 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -926,8 +926,8 @@ follows: fit_intercept=True, l1_ratio=0, link='log', max_iter=100, random_state=None, selection='random', solver='auto', start_params=None, tol=0.0001, verbose=0, warm_start=False) - >>> reg.coef_ - array([ 0.24630255, 0.43373521]) + >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + array([0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS -0.76383575... diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index edf579a416973..c5d132d35bdb9 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -202,8 +202,8 @@ def test_glm_P2_argument(): """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for P2 in [np.full((2, 2), 'a string'), [[1, [2]], [3, 4]], [1, 2, 3], - [[1, 2]], [[1], [2]]]: + for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Sun, 5 Aug 2018 14:48:33 +0200 Subject: [PATCH 025/209] [MRG] Add Generalized Linear Models (#9405) * fix typos in doc --- doc/modules/linear_model.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 9f85da771c6f1..a204ccb080cc9 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -934,7 +934,7 @@ follows: Mathematical formulation ------------------------ -In the unpenalized case, the assumptions are the folowing: +In the unpenalized case, the assumptions are the following: * The target values :math:`y_i` are realizations of random variables :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` @@ -951,7 +951,7 @@ same as specifying a unit variance function (they are one-to-one). Including penalties helps to avoid overfitting or, in case of L1 penalty, to obtain sparse solutions. But there are also other motivations to include them, -e.g. accounting fo dependence structure of :math:`y`. +e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. 
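To make the objective concrete for the Poisson case with log-link, here is a small numpy sketch built from the Poisson unit deviance :math:`2(y\log\frac{y}{\mu} - y + \mu)` and the elastic-net terms described above; it only illustrates the function being minimized (with P1 and P2 taken as identity), not the solvers actually used, and the exact scaling may differ from the implementation::

    import numpy as np
    from scipy.special import xlogy

    def poisson_objective(w, X, y, s, alpha, l1_ratio):
        """Weighted Poisson deviance plus elastic-net penalty."""
        mu = np.exp(X @ w)                          # log-link: mu = h(Xw) = exp(Xw)
        dev = 2 * (xlogy(y, y / mu) - y + mu)       # unit deviance, xlogy handles y = 0
        obj = np.sum(s * dev) / (2 * np.sum(s))
        obj += alpha * l1_ratio * np.sum(np.abs(w))            # L1 part
        obj += 0.5 * alpha * (1 - l1_ratio) * np.dot(w, w)     # L2 part
        return obj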
From 809e3a2747e451fc97b9a462cf682110d59fac25 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 26 Aug 2018 20:41:25 +0200 Subject: [PATCH 026/209] Remove test_glm_P2_argument --- sklearn/linear_model/tests/test_glm.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index c5d132d35bdb9..82b8ec2435543 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -197,15 +197,15 @@ def test_glm_P1_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_P2_argument(): - """Test GLM P2 arguments - """ - y = np.array([1, 2]) - X = np.array([[1], [1]]) - for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Thu, 30 Aug 2018 19:33:27 +0200 Subject: [PATCH 027/209] Filter out DeprecationWarning in old versions of scipy.sparse.linalg.spsolve about usage of umfpack --- sklearn/linear_model/tests/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 82b8ec2435543..e4be75ddb7a64 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -355,6 +355,7 @@ def test_glm_log_regression(): assert_array_almost_equal(res.coef_, coef) +@pytest.mark.filterwarnings('ignore:DeprecationWarning') def test_normal_ridge(): """Test ridge regression for Normal distributions From 46df5b6555938dc7c9acac67929b0633ea1354a3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 20:40:08 +0200 Subject: [PATCH 028/209] import pytest --- sklearn/linear_model/tests/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index e4be75ddb7a64..dfa205407a193 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,5 +1,6 @@ import numpy as np from numpy.testing import assert_allclose +import pytest import scipy as sp from scipy import sparse From 21f2136c083b4057868b7886880f356e58703611 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 21:17:53 +0200 Subject: [PATCH 029/209] Document arguments of abstact methods --- sklearn/linear_model/glm.py | 125 +++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index eae4a56ea1d95..825ee6bfe8c45 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -93,12 +93,22 @@ class Link(six.with_metaclass(ABCMeta)): def link(self, mu): """The link function g(mu) with argument mu=E[Y] returns the linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the predicted mean. """ raise NotImplementedError @abstractmethod def derivative(self, mu): """Derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the predicted mean. """ raise NotImplementedError @@ -106,18 +116,33 @@ def derivative(self, mu): def inverse(self, lin_pred): """The inverse link function h(lin_pred) with the linear predictor as argument returns mu=E[Y]. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative(self, lin_pred): """Derivative of the inverse link function h'(lin_pred). 
+ + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative2(self, lin_pred): """Second derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @@ -236,6 +261,11 @@ def include_upper_bound(self): def in_y_range(self, x): """Returns true if `x` is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. """ if self.include_lower_bound: if self.include_upper_bound: @@ -263,12 +293,22 @@ def unit_variance(self, mu): \partial\mu^2}}\big|_{y=\mu} See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ raise NotImplementedError() @abstractmethod def unit_variance_derivative(self, mu): r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. """ raise NotImplementedError() @@ -276,6 +316,17 @@ def variance(self, mu, phi=1, weights=1): r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return phi/weights * self.unit_variance(mu) @@ -284,6 +335,17 @@ def variance_derivative(self, mu, phi=1, weights=1): :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return phi/weights * self.unit_variance_derivative(mu) @@ -293,6 +355,14 @@ def unit_deviance(self, y, mu): In terms of the log-likelihood it is given by :math:`d(y,\mu) = -2\phi\cdot \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. """ raise NotImplementedError() @@ -301,9 +371,13 @@ def unit_deviance_derivative(self, y, mu): :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. - Returns - ------- - derivative: array, shape = (n_samples,) + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. """ return -2*(y-mu)/self.unit_variance(mu) @@ -313,6 +387,17 @@ def deviance(self, y, mu, weights=1): In terms of the likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return np.sum(weights*self.unit_deviance(y, mu)) @@ -326,6 +411,17 @@ def _deviance(self, coef, X, y, weights, link): def deviance_derivative(self, y, mu, weights=1): """The derivative w.r.t. 
`mu` of the deviance. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return weights*self.unit_deviance_derivative(y, mu) @@ -464,7 +560,16 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): return eta, mu, score, fisher def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in (unpenalized) IRLS.""" + """Starting values for the mean mu_i in (unpenalized) IRLS. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ return ((weights*y+np.mean(weights*y)) / (2.*np.sum(np.ones_like(y)*weights))) @@ -656,12 +761,12 @@ def _irls_step(X, W, P2, z): X : numpy array or sparse matrix of shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array of shape (n_samples, ) + W : numpy array of shape (n_samples,) P2 : numpy array or sparse matrix of shape (n_features, n_features) The l2-penalty matrix or vector (=diagonal matrix) - z : numpy array of shape (n_samples, ) + z : numpy array of shape (n_samples,) Working observations Returns @@ -927,12 +1032,12 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- X : numpy array or sparse matrix of shape (n_samples, n_features) - Training data + Training data. - y : numpy array of shape (n_samples, ) - Target values + y : numpy array of shape (n_samples,) + Target values. - sample_weight : array of shape (n_samples, ) or None,\ + sample_weight : array of shape (n_samples,) or None,\ optinal (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has From 1faedf87eecfd2c6a668e2aec6e28a9fff8780ec Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 21:18:51 +0200 Subject: [PATCH 030/209] Pytest filter warnings use two colons --- sklearn/linear_model/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index dfa205407a193..1ac5ccd4d3d5c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -356,7 +356,7 @@ def test_glm_log_regression(): assert_array_almost_equal(res.coef_, coef) -@pytest.mark.filterwarnings('ignore:DeprecationWarning') +@pytest.mark.filterwarnings('ignore::DeprecationWarning') def test_normal_ridge(): """Test ridge regression for Normal distributions From 992f9819838336a9372a7cadecc53eeafef439ff Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 22:07:33 +0200 Subject: [PATCH 031/209] Improve documentation of arguments that were so far undocumented --- sklearn/linear_model/glm.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 825ee6bfe8c45..a0d0266fa0efe 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -579,6 +579,17 @@ class TweedieDistribution(ExponentialDispersionModel): They have :math:`\mu=\mathrm{E}[Y]` and :math:`\mathrm{Var}[Y] \propto \mu^power. 
+ Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (0,1) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + Attributes ---------- power : float @@ -586,6 +597,12 @@ class TweedieDistribution(ExponentialDispersionModel): :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): + """ + Parameters + ---------- + power : float (default=0) + Power of (of mu) of the variance function. + """ self.power = power self._upper_bound = np.Inf self._include_upper_bound = False @@ -623,6 +640,9 @@ def __init__(self, power=0): # Positive Stable self._lower_bound = 0 self._include_lower_bound = False + else: + raise ValueError('The power must be a float, i.e. real number, ' + 'got (power={})'.format(power)) @property def power(self): @@ -653,12 +673,22 @@ def include_upper_bound(self): def unit_variance(self, mu): """The unit variance of a Tweedie distribution is v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ return np.power(mu, self.power) def unit_variance_derivative(self, mu): """The derivative of the unit variance of a Tweedie distribution is v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ return self.power*np.power(mu, self.power-1) @@ -680,9 +710,6 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, w, phi, weights=1): - raise NotImplementedError('This function is not (yet) implemented.') - class NormalDistribution(TweedieDistribution): """Class for the Normal (aka Gaussian) distribution""" From 06b8451ea109040371615e9e7baaa8ff505197f0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 00:26:39 +0200 Subject: [PATCH 032/209] Further improve documentation of arguments --- sklearn/linear_model/glm.py | 43 +++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index a0d0266fa0efe..9688b1e0c9e5c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -601,7 +601,7 @@ def __init__(self, power=0): Parameters ---------- power : float (default=0) - Power of (of mu) of the variance function. + Variance power of the `unit_variance` function. """ self.power = power self._upper_bound = np.Inf @@ -798,7 +798,7 @@ def _irls_step(X, W, P2, z): Returns ------- - coef: array, shape = (X.shape[1]) + coef: array, shape (X.shape[1]) """ # TODO: scipy.linalg.solve is faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? @@ -898,7 +898,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : None or array of shape (n_features*, ), optional\ + P1 : None or array of shape (n_features*,), optional\ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The @@ -1007,7 +1007,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- - coef_ : array, shape (n_features, ) + coef_ : array, shape (n_features,) Estimated coefficients for the linear predictor (X*coef_) in the GLM. 
intercept_ : float @@ -1679,7 +1679,7 @@ def Hs(s): if self.fit_dispersion in ['chisqr', 'deviance']: # attention because of rescaling of weights - self.dispersion_ = self.estimate_phi(y, X, weights)*weights_sum + self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum return self @@ -1688,12 +1688,12 @@ def linear_predictor(self, X): Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix, shape (n_samples, n_features) Samples. Returns ------- - C : array, shape = (n_samples) + C : array, shape (n_samples) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") @@ -1709,12 +1709,15 @@ def predict(self, X, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix, shape (n_samples, n_features) Samples. + sample_weight : array of shape (n_samples,) or None , \ + (default=None) + Returns ------- - C : array, shape = (n_samples) + C : array, shape (n_samples,) Returns predicted values times sample_weight. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], @@ -1726,9 +1729,21 @@ def predict(self, X, sample_weight=None): return mu*weights - def estimate_phi(self, y, X, sample_weight=None): - """Estimation of the dispersion parameter. + def estimate_phi(self, X, y, sample_weight=None): + """Estimation of the dispersion parameter phi. Returns the estimate. + + Parameters + ---------- + X : numpy array or sparse matrix of shape (n_samples, n_features) + Training data. + + y : numpy array, shape (n_samples,) + Target values. + + sample_weight : array of shape (n_samples,) or None,\ + optinal (default=None) + Sample weights. """ check_is_fitted(self, "coef_") _dtype = [np.float64, np.float32] @@ -1773,13 +1788,13 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape = (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Test samples - y : array-like of shape = (n_samples) + y : array-like, shape (n_samples,) True valeus for X. - sample_weight : array-like, shape = (n_samples), optional + sample_weight : array-like, shape = (n_samples,), optional Sample weights. Returns From c93f60d9e98a5be0d493d513e37c3c9de5167542 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 08:25:01 +0200 Subject: [PATCH 033/209] Remove parameters docstring for __init__ --- sklearn/linear_model/glm.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 9688b1e0c9e5c..021927b598822 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -597,12 +597,6 @@ class TweedieDistribution(ExponentialDispersionModel): :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): - """ - Parameters - ---------- - power : float (default=0) - Variance power of the `unit_variance` function. 
- """ self.power = power self._upper_bound = np.Inf self._include_upper_bound = False From 66ec63b5157026f8541e2761c29ef3225d89a44c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 19:24:23 +0200 Subject: [PATCH 034/209] Fix typos in docstring of TweedieDistribution --- sklearn/linear_model/glm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 021927b598822..65abb42b043d5 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -577,7 +577,7 @@ def starting_mu(self, y, weights=1): class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. They have :math:`\mu=\mathrm{E}[Y]` and - :math:`\mathrm{Var}[Y] \propto \mu^power. + :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -593,7 +593,7 @@ class TweedieDistribution(ExponentialDispersionModel): Attributes ---------- power : float - The variance power of the unit_variance + The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): From 53c69702c790223d3940cd650b9d52a61e39b244 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 22:20:43 +0200 Subject: [PATCH 035/209] Change docstring section of TweedieDistribution from Attributes to Parameters --- sklearn/linear_model/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 65abb42b043d5..fcb6c9754b826 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -590,7 +590,7 @@ class TweedieDistribution(ExponentialDispersionModel): 2 Gamma 3 Inverse Gaussian - Attributes + Parameters ---------- power : float The variance power of the `unit_variance` From 87d5ba38c5663224edd7ec10bc7efdf27172e7f8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 7 Oct 2018 19:43:42 +0200 Subject: [PATCH 036/209] Minor doc improvements of GeneralizedLinearRegressor --- sklearn/linear_model/glm.py | 87 +++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index fcb6c9754b826..01d5420773ffc 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,17 +6,16 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Write more tests -# TODO: Write examples and more docu -# TODO: deal with option self.copy_X +# TODO: Write examples +# TODO: Make option self.copy_X more meaningfull than just for start values. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. -# TODO: Add cross validation support +# TODO: Add cross validation support? # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used to +# TODO: Negative values in P1 are not allowed so far. They could be used # for group lasso. # Design Decisions: @@ -26,7 +25,7 @@ # regressor, Bernoulli/Binomial => classifier. # Solution: GeneralizedLinearRegressor since this is the focus. 
# - Allow for finer control of penalty terms: -# L1: ||P1*w||_1 with P1*w a componentwise product, this allows to exclude +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude # factors from the L1 penalty. # L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and @@ -322,7 +321,7 @@ def variance(self, mu, phi=1, weights=1): mu : array, shape (n_samples,) Predicted mean. - phi : float + phi : float (default=1) Dispersion parameter. weights : array, shape (n_samples,) (default=1) @@ -592,7 +591,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- - power : float + power : float (default=0) The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. """ @@ -779,22 +778,22 @@ def _irls_step(X, W, P2, z): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {numpy array, sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array of shape (n_samples,) + W : numpy array, shape (n_samples,) - P2 : numpy array or sparse matrix of shape (n_features, n_features) - The l2-penalty matrix or vector (=diagonal matrix) + P2 : {numpy array, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) - z : numpy array of shape (n_samples,) + z : numpy array, shape (n_samples,) Working observations Returns ------- coef: array, shape (X.shape[1]) """ - # TODO: scipy.linalg.solve is faster, but ordinary least squares uses + # TODO: scipy.linalg.solve seems faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? n_samples, n_features = X.shape if sparse.issparse(X): @@ -892,19 +891,20 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : None or array of shape (n_features*,), optional\ + P1 : {None, array-like}, shape (n_features*,), optional\ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as an array of ones. + default value ``None`` is the same as a 1d array of ones. Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). - P2 : None or array of shape (n_features*, n_features*), optional\ - (default=None) + P2 : {None, array-like, sparse matrix}, shape \ + (n_features*, n_features*), optional (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov regularization). + The default value ``None`` is the same as the idendity matrix. Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). P2 must be positive semi-definite. @@ -939,8 +939,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'cd' is the coordinate descent algorithm. It can deal with L1 and - L2 penalties. + - 'cd' is the coordinate descent algorithm. It can + deal with L1 as well as L2 penalties. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. 
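For intuition about the 'irls' solver named above, the textbook form of one penalized IRLS update solves the linear system :math:`(X^T W X + P_2)\,w = X^T W z` with working weights :math:`W` and working observations :math:`z`. A dense-matrix sketch (illustration only; the actual ``_irls_step`` also handles sparse input and may solve the system differently)::

    import numpy as np
    from scipy import linalg

    def irls_step_sketch(X, W, P2, z):
        """One penalized IRLS update: solve (X'WX + P2) w = X'Wz."""
        XtW = X.T * W                 # equivalent to X.T @ np.diag(W)
        A = XtW @ X + P2              # P2: L2 penalty matrix
        b = XtW @ z                   # z: working observations
        return linalg.solve(A, b)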
@@ -958,8 +958,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to fit), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. - start_params : {None, 'least_squares', 'zero'} or array of shape \ - (n_features, ) or }, optional (default=None) + start_params : {None, 'least_squares', 'zero', array of shape \ + (n_features, )}, optional (default=None) If an array of size n_features is supplied, use these as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. @@ -979,7 +979,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional (default=None) + random_state : {int, RandomState instance, None}, optional (default=None) The seed of the pseudo random number generator that selects a random feature to be updated for solver 'cd' (coordinate descent). If int, random_state is the seed used by the random @@ -1052,13 +1052,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array of shape (n_samples,) + y : array-like, shape (n_samples,) Target values. - sample_weight : array of shape (n_samples,) or None,\ + sample_weight : {None, array-like}, shape (n_samples,),\ optinal (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has @@ -1190,12 +1190,12 @@ def fit(self, X, y, sample_weight=None): if self.P1 is None: P1 = np.ones(X.shape[1]) else: - P1 = np.atleast_1d(np.copy(self.P1)) + P1 = np.copy(np.atleast_1d(self.P1)) if P1.dtype.kind not in ['b', 'i', 'u', 'f']: raise ValueError("P1 must be a numeric value; " "got (dtype={0}).".format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): - raise ValueError("P1 must be either None or an 1D array with " + raise ValueError("P1 must be either None or a 1d array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." @@ -1324,6 +1324,7 @@ def fit(self, X, y, sample_weight=None): coef = _irls_step(Xnew, W, P2, z) else: # with L1 penalty, start with coef = 0 + # TODO: Are there better options? coef = np.zeros(n_features) elif isinstance(self.start_params, six.string_types): if self.start_params == 'zero': @@ -1353,7 +1354,7 @@ def fit(self, X, y, sample_weight=None): # 4. fit # ####################################################################### # algorithms for optimiation - # TODO: Parallelize it + # TODO: Parallelize it? self.n_iter_ = 0 converged = False # 4.1 IRLS ############################################################ @@ -1682,12 +1683,12 @@ def linear_predictor(self, X): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. Returns ------- - C : array, shape (n_samples) + C : array, shape (n_samples,) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") @@ -1703,17 +1704,18 @@ def predict(self, X, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. 
- sample_weight : array of shape (n_samples,) or None , \ - (default=None) + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Returns ------- C : array, shape (n_samples,) Returns predicted values times sample_weight. """ + # TODO: Is copy=True necessary? X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) @@ -1729,14 +1731,14 @@ def estimate_phi(self, X, y, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array, shape (n_samples,) + y : array-like, shape (n_samples,) Target values. - sample_weight : array of shape (n_samples,) or None,\ - optinal (default=None) + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Sample weights. """ check_is_fitted(self, "coef_") @@ -1782,13 +1784,14 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) - Test samples + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. y : array-like, shape (n_samples,) - True valeus for X. + True values of target. - sample_weight : array-like, shape = (n_samples,), optional + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Sample weights. Returns From a9ae023ec331e782d7a23de18f7e6fbb0dd1f57d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 8 Oct 2018 20:21:35 +0200 Subject: [PATCH 037/209] Double escape in doctring of GeneralizedLinearRegressor --- sklearn/linear_model/glm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 01d5420773ffc..535af60289f8e 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -866,13 +866,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO: Estimation of the dispersion parameter phi. - If your target `y` is a ratio, you should also provide appropriate weights - `w`. As an example, consider Poission distributed counts `z` (integers) and - weights `w`=exposure (time, money, persons years, ...), then you fit + If the target `y` is a ratio, appropriate weights `w` should be provided. + As an example, consider Poission distributed counts `z` (integers) and + weights `w`=exposure (time, money, persons years, ...). Then you fit `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, - sample_weight=w)``. You need the weights for the right mean, consider: - :math:`\bar(y) = \frac{\sum_i w_i y_i}{\sum_i w_i}`. - In this case one might say that y has a 'scaled' Poisson distributions. + sample_weight=w)``. The weights are necessary for the right mean, consider: + :math:`\\bar(y) = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`. + In this case one might say that 'y' has a 'scaled' Poisson distributions. The same holds for other distributions. Parameters @@ -891,7 +891,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {None, array-like}, shape (n_features*,), optional\ + P1 : {None, array-like}, shape (n_features*,), optional \ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). 
The From bb62485166412d8ba6393e5ba753b015b806867c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 31 Dec 2018 16:54:40 +0100 Subject: [PATCH 038/209] Add example for GeneralizedLinearRegressor * add example * improve docstring of GeneralizedLinearRegressor * improve user guide for GeneralizedLinearRegressor --- doc/modules/linear_model.rst | 34 +++++--- .../plot_poisson_spline_regression.py | 83 +++++++++++++++++++ sklearn/linear_model/glm.py | 22 ++--- 3 files changed, 115 insertions(+), 24 deletions(-) create mode 100644 examples/linear_model/plot_poisson_spline_regression.py diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a204ccb080cc9..d65f7ed121f8e 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -901,12 +901,15 @@ for a more versatile L2 penalty. Use cases, where a loss different from the squared loss might be appropriate, are the following: - * If the target values :math:`y` are counts (integer valued) or frequencies, you might try a Poisson deviance. + * If the target values :math:`y` are counts (non-negative integer valued) or + frequencies (non-negative), you might use a Poisson deviance with log-link. - * If the target values are positive valued and skewed, you might try a Gamma deviance. + * If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. - * If the target values seem to be heavier tailed than a Gamma distribution, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). - Keep in mind that the mean is not a good measure for very heavy tailed distributions, cf. extreme value theory. + * If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, @@ -931,6 +934,11 @@ follows: >>> reg.intercept_ #doctest: +ELLIPSIS -0.76383575... + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + Mathematical formulation ------------------------ @@ -956,20 +964,20 @@ e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. -The deviance is defined by +The deviance is defined by the log of the EDM likelihood as .. 
math:: D(y, \mu) = -2\phi\cdot \left(loglike(y,\mu,\frac{\phi}{s}) - loglike(y,y,\frac{\phi}{s})\right) -===================================== ================================= -Distribution Variance Function :math:`v(\mu)` -===================================== ================================= -Normal ("normal") :math:`1` -Poisson ("poisson") :math:`\mu` -Gamma ("gamma") :math:`\mu^2` -Inverse Gaussian ("inverse.gaussian") :math:`\mu^3` -===================================== ================================= +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Deviance :math:`D(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{/mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ Two remarks: diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py new file mode 100644 index 0000000000000..b98bca5d8f867 --- /dev/null +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -0,0 +1,83 @@ +""" +================================= +Poisson Regression with B-Splines +================================= + +As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` +example, a Poisson regression with penalized B-splines (P-splines) [1]_ is +fitted on slightly different sinusodial, Poisson distributed data and +compared to an AdaBoost model with decision trees. +One can see, that this is a hard problem for both estimators. + +.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines + and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. 
+ `doi:10.1214/ss/1038425655 + `_ + +""" +print(__doc__) + +# Author: Christian Lorentzen +# based on the AdaBoost regression example from Noel Dawe +# License: BSD 3 clause + +# importing necessary libraries +import numpy as np +from scipy.linalg import toeplitz +from scipy.interpolate import BSpline +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import AdaBoostRegressor +from sklearn.linear_model import GeneralizedLinearRegressor + + +# Create the dataset +xmin, xmax = 0, 6 +rng = np.random.RandomState(1) +X = np.linspace(xmin, xmax, 500)[:, np.newaxis] +y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) +y = rng.poisson(y_true, X.shape[0]) + +# b-spline basis +nknots, degree = 40, 3 +ns = nknots - degree - 1 # number of base spline functions +dx = (xmax - xmin) / (nknots - 1 - 2 * degree) +knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) +coef = np.zeros(ns) +splineBasis = np.empty((X.shape[0], ns), dtype=float) +for i in range(ns): + coef[i] = 1 + splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ + .ravel() + coef[i] = 0 + +# second order difference matrix +P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) +P2[0, 0] = P2[-1, -1] = 1 + +# Fit regression model +regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), + n_estimators=10, random_state=rng) + +regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', + fit_intercept=True, alpha=0.02, + l1_ratio=0.1, P2=P2) + +regr_1.fit(X, y) +regr_2.fit(splineBasis, y) + +# Predict +y_1 = regr_1.predict(X) +y_2 = regr_2.predict(splineBasis) + +# Plot the results +plt.figure() +plt.plot(X, y_true, c="b", label="true mean") +plt.scatter(X, y, c="k", marker='.', label="training samples") +plt.plot(X, y_1, c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 535af60289f8e..37afc8da2d6db 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,7 +6,7 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Write examples +# TODO: Write more examples. # TODO: Make option self.copy_X more meaningfull than just for start values. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. @@ -832,7 +832,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): with inverse link function `h` and s=sum of `sample_weight` (which equals n_samples for `sample_weight=None`). - For `P1`=`P2`=identity, the penalty is the elastic net:: + For `P1=P2=identity`, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 @@ -868,11 +868,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If the target `y` is a ratio, appropriate weights `w` should be provided. As an example, consider Poission distributed counts `z` (integers) and - weights `w`=exposure (time, money, persons years, ...). Then you fit - `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, - sample_weight=w)``. The weights are necessary for the right mean, consider: - :math:`\\bar(y) = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`. 
- In this case one might say that 'y' has a 'scaled' Poisson distributions. + weights `w=exposure` (time, money, persons years, ...). Then you fit + `y = z/w`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=w)``. The weights are necessary for the right meanself. + Consider :math:`\\bar{y} = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`, + in this case one might say that `y` has a 'scaled' Poisson distributions. The same holds for other distributions. Parameters @@ -1017,10 +1017,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): References ---------- For the coordinate descent implementation: - .. [1] Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', From 16d064db7cee1d59569d21631cc2fa41be8b3b14 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jan 2019 11:58:41 +0100 Subject: [PATCH 039/209] Resolve merge conflicts * resolve merge conflicts in linear_model.rst * replace BSpline by splev to support older scipy versions --- doc/modules/linear_model.rst | 14 +++++++------- .../linear_model/plot_poisson_spline_regression.py | 8 +++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d65f7ed121f8e..09f14735c2907 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -876,18 +876,18 @@ to warm-starting (see :term:`Glossary `). .. _Generalized_linear_regression: -Generalized linear regression +Generalized Linear Regression ============================= :class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [8]_. First, the predicted values :math:`\hat{y}` are linked to a linear +ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [9]_. The objective function beeing minimized +exponential dispersion model (EDM) [11]_. The objective function beeing minimized becomes .. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 @@ -983,7 +983,7 @@ Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [10]_. + (19)-(20) in [12]_. * If you want to model a frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing @@ -993,12 +993,12 @@ Two remarks: .. topic:: References: - .. [8] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [9] Jørgensen, B. (1992). 
The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. See also `Exponential dispersion model. `_ - .. [10] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py index b98bca5d8f867..fce85fae1ea8c 100644 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -24,7 +24,8 @@ # importing necessary libraries import numpy as np from scipy.linalg import toeplitz -from scipy.interpolate import BSpline +# from scipy.interpolate import BSpline +from scipy.interpolate import splev import matplotlib.pyplot as plt from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor @@ -47,8 +48,9 @@ splineBasis = np.empty((X.shape[0], ns), dtype=float) for i in range(ns): coef[i] = 1 - splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ - .ravel() +# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ +# .ravel() + splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() coef[i] = 0 # second order difference matrix From 1a02a901d1a6d99484241ca205178ac61fc47846 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jan 2019 12:59:30 +0100 Subject: [PATCH 040/209] Adapt for minimum numpy version * replace np.block --- sklearn/linear_model/glm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 37afc8da2d6db..851767055c61c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1235,8 +1235,11 @@ def fit(self, X, y, sample_weight=None): P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), dtype=P2.dtype).tocsr() else: - P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], - [np.zeros((X.shape[1], 1)), P2]]) + # as of numpy 1.13 this would work: + # P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], + # [np.zeros((X.shape[1], 1)), P2]]) + P2 = np.hstack((np.zeros((X.shape[1], 1)), P2)) + P2 = np.vstack((np.zeros((1, X.shape[1]+1)), P2)) else: Xnew = X From 177eb4cc017a7262e472070b4a920250711a099c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Jan 2019 19:46:19 +0100 Subject: [PATCH 041/209] Remove six dependencies as in #12639 * replace six.with_metaclass(ABCMeta) by metaclass=ABCMeta * replace six.integer_types by int * replace six.string_types by str * rebase * correct email address --- sklearn/linear_model/glm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 851767055c61c..aca49ec7edf28 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -2,12 +2,13 @@ Generalized Linear Models with Exponential Dispersion Family """ -# Author: Christian Lorentzen +# Author: Christian Lorentzen # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause # TODO: Write more examples. -# TODO: Make option self.copy_X more meaningfull than just for start values. +# TODO: Make option self.copy_X more meaningful. 
+# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -51,7 +52,6 @@ from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning -from ..externals import six from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg @@ -84,7 +84,7 @@ def _check_weights(sample_weight, n_samples): return weights -class Link(six.with_metaclass(ABCMeta)): +class Link(metaclass=ABCMeta): """Abstract base class for Link funtions """ @@ -186,7 +186,7 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) -class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): +class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -1142,7 +1142,7 @@ def fit(self, X, y, sample_weight=None): "with L1 penalties, which are included with " "(alpha={1}) and (l1_ratio={2})." .format(solver, self.alpha, self.l1_ratio)) - if (not isinstance(self.max_iter, six.integer_types) + if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " "integer;" @@ -1156,7 +1156,7 @@ def fit(self, X, y, sample_weight=None): start_params = self.start_params if start_params is None: pass - elif isinstance(start_params, six.string_types): + elif isinstance(start_params, str): if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " "'least-squares', 'zero' or an array of right" @@ -1329,7 +1329,7 @@ def fit(self, X, y, sample_weight=None): # with L1 penalty, start with coef = 0 # TODO: Are there better options? coef = np.zeros(n_features) - elif isinstance(self.start_params, six.string_types): + elif isinstance(self.start_params, str): if self.start_params == 'zero': coef = np.zeros(n_features) elif self.start_params == 'least_squares': From 3d4c784df6797c244b31dd3393083d2c63114bf2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 3 Feb 2019 21:38:24 +0100 Subject: [PATCH 042/209] Improve user guide, doc and fix penalty parameter for Ridge * move parts of docstring to new Notes section * improve user guide and doc * fix typos * fix scaling of penalty parameter in Ridge() * docstring for _check_weights * reduce tol for parameter initialization --- doc/modules/linear_model.rst | 18 +++--- sklearn/linear_model/glm.py | 114 +++++++++++++++++++---------------- 2 files changed, 73 insertions(+), 59 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09f14735c2907..174d1e4eddae4 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -890,7 +890,7 @@ Secondly, the squared loss function is replaced by the deviance :math:`D` of an exponential dispersion model (EDM) [11]_. The objective function beeing minimized becomes -.. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 +.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 +\frac{\alpha(1-\rho)}{2} w^T P_2 w with sample weights :math:`s`. 
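To make the weighted objective above concrete, here is a small NumPy sketch that
evaluates it for the normal family, where the unit deviance is the squared
error; the function name and data are made up for illustration::

    import numpy as np

    def objective(w, X, y, s, alpha, rho, P1, P2):
        # 1/(2*sum(s)) * D(y, h(Xw); s) + alpha*rho*||P1*w||_1 + alpha*(1-rho)/2 * w'P2 w
        mu = X @ w                                 # identity link, normal family
        deviance = np.sum(s * (y - mu) ** 2)       # D(y, mu; s) = sum_i s_i * d(y_i, mu_i)
        l1 = alpha * rho * np.sum(np.abs(P1 * w))  # P1*w is an element-wise product
        l2 = 0.5 * alpha * (1 - rho) * w @ P2 @ w
        return deviance / (2 * np.sum(s)) + l1 + l2

    rng = np.random.RandomState(0)
    X = rng.randn(50, 4)
    y = X @ np.array([1.0, 0.0, -2.0, 0.5]) + rng.randn(50)
    print(objective(np.zeros(4), X, y, s=np.ones(50), alpha=0.1, rho=0.5,
                    P1=np.ones(4), P2=np.eye(4)))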
@@ -914,7 +914,7 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This @@ -964,17 +964,19 @@ e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. -The deviance is defined by the log of the EDM likelihood as +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as -.. math:: D(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\frac{\phi}{s}) - - loglike(y,y,\frac{\phi}{s})\right) +.. math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) ===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Deviance :math:`D(y, \mu)` +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` ===================================== =============================== ================================= ============================================ Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{/mu}-y+\mu)` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` ===================================== =============================== ================================= ============================================ diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index aca49ec7edf28..5fc869f81195f 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -8,7 +8,7 @@ # TODO: Write more examples. # TODO: Make option self.copy_X more meaningful. -# So far, fit uses Xnew instead of X. +# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -28,7 +28,7 @@ # - Allow for finer control of penalty terms: # L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude # factors from the L1 penalty. -# L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be +# L2: w*P2*w with P2 a (semi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and # Tikhonov regularization). 
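The design note above mentions difference matrices as a typical choice for
``P2`` (B-spline penalties, Tikhonov regularization); the spline example added
earlier in this series builds exactly such a matrix with ``toeplitz``. A short
sketch of that construction and its equivalent form as ``D.T @ D`` for a
difference operator ``D``::

    import numpy as np
    from scipy.linalg import toeplitz

    n_features = 6

    # banded penalty matrix as in the P-spline example of this series;
    # interior rows carry the difference stencil [-1, 2, -1]
    P2 = toeplitz([2.0, -1.0] + [0.0] * (n_features - 2))
    P2[0, 0] = P2[-1, -1] = 1.0

    # equivalent construction: w @ P2 @ w == sum((w[j+1] - w[j])**2),
    # i.e. a Tikhonov-style roughness penalty on neighbouring coefficients
    D = np.diff(np.eye(n_features), axis=0)
    assert np.allclose(P2, D.T @ D)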
# - The link funtion (instance of class Link) is necessary for the evaluation @@ -59,6 +59,8 @@ def _check_weights(sample_weight, n_samples): + """Check that weights are non-negative and have the right shape + """ if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): @@ -594,6 +596,7 @@ class TweedieDistribution(ExponentialDispersionModel): power : float (default=0) The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. + For ``0`. - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\\mu_i=h(\\eta_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{w_i} v(\\mu_i)`. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - (penalized) maximum likelihood which is equivalent to minimizing the - deviance. - - For `alpha` > 0, the feature matrix `X` should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - - TODO: Estimation of the dispersion parameter phi. - - If the target `y` is a ratio, appropriate weights `w` should be provided. - As an example, consider Poission distributed counts `z` (integers) and - weights `w=exposure` (time, money, persons years, ...). Then you fit - `y = z/w`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, - sample_weight=w)``. The weights are necessary for the right meanself. - Consider :math:`\\bar{y} = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`, - in this case one might say that `y` has a 'scaled' Poisson distributions. - The same holds for other distributions. - Parameters ---------- alpha : float, optional (default=1) @@ -891,22 +871,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {None, array-like}, shape (n_features*,), optional \ + P1 : {None, array-like}, shape (n_features,), optional \ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The default value ``None`` is the same as a 1d array of ones. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). + Note that n_features = X.shape[1]. P2 : {None, array-like, sparse matrix}, shape \ - (n_features*, n_features*), optional (default=None) + (n_features, n_features), optional (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov - regularization). - The default value ``None`` is the same as the idendity matrix. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). P2 must be positive semi-definite. + regularization). The diagonal zeros of a diagonal P2, for example, + exclude all corresponding coefficients from the L2 penalty. + The default value ``None`` is the same as the identity matrix. + Note that n_features = X.shape[1]. P2 must be positive semi-definite. fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be @@ -929,18 +908,22 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ optional (default='auto') - Algorithm to use in the optimization problem. + Algorithm to use in the optimization problem: - - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. 
+ 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. - - 'irls' is iterated reweighted least squares (Fisher scoring). + 'irls' + iterated reweighted least squares (Fisher scoring). It is the standard algorithm for GLMs. Cannot deal with L1 penalties. - - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. + 'newton-cg', 'lbfgs' + Cannot deal with L1 penalties. - - 'cd' is the coordinate descent algorithm. It can - deal with L1 as well as L2 penalties. + 'cd' + coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -959,10 +942,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): starting values for ``coef_`` and ``intercept_``. start_params : {None, 'least_squares', 'zero', array of shape \ - (n_features, )}, optional (default=None) - If an array of size n_features is supplied, use these as start values + (n_features*, )}, optional (default=None) + If an array of size n_features* is supplied, use it as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept includes the + intercept in counting. If 'least_squares' is set, the result of a least squares fit in the link space (linear predictor) is taken. If 'zero' is set, all coefficients start with zero. @@ -1013,6 +998,30 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations of the solver. + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + (penalized) maximum likelihood which is equivalent to minimizing the + deviance. + + For `alpha` > 0, the feature matrix `X` should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + + If the target `y` is a ratio, appropriate sample weights `s` should be + provided. + As an example, consider Poission distributed counts `z` (integers) and + weights `s=exposure` (time, money, persons years, ...). Then you fit + `y = z/s`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=s)``. The weights are necessary for the right (finite + sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that `y` has a 'scaled' Poisson distributions. + The same holds for other distributions. References ---------- @@ -1138,10 +1147,10 @@ def fit(self, X, y, sample_weight=None): else: solver = 'cd' if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): - raise ValueError("The chosen solver (solver={0}) can't deal " - "with L1 penalties, which are included with " - "(alpha={1}) and (l1_ratio={2})." - .format(solver, self.alpha, self.l1_ratio)) + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1340,14 +1349,17 @@ def fit(self, X, y, sample_weight=None): elif self.l1_ratio <= 0.01: # ElasticNet says l1_ratio <= 0.01 is not reliable # => use Ridge + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha) + alpha=self.alpha*n_samples, + tol=np.max([self.tol, np.sqrt(self.tol)])) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio) + alpha=self.alpha, l1_ratio=self.l1_ratio, + tol=np.max([self.tol, np.sqrt(self.tol)])) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: @@ -1557,7 +1569,7 @@ def Hs(s): # minimize_z: a z + 1/2 b z^2 + c |d+z| # a = A_j # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, ee 1.3 + # c = |P1_j| = P1_j > 0, see 1.3 # d = w_j + d_j # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) # with beta = z+d, beta_hat = d-a/b and gamma = c/b From 919912c3f98e6d1190737e344c14f31c2eef9077 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 17 Feb 2019 18:38:36 +0100 Subject: [PATCH 043/209] Smarter intercept initialization and docstring improvements * smarter initialization of intercept * PEP 257 -- Docstring Conventions * minor docstring changes --- sklearn/linear_model/glm.py | 265 +++++++++++++++++++++--------------- 1 file changed, 153 insertions(+), 112 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 5fc869f81195f..d69ccd0a66486 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -59,14 +59,13 @@ def _check_weights(sample_weight, n_samples): - """Check that weights are non-negative and have the right shape - """ + """Check that weights are non-negative and have the right shape.""" if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): if sample_weight <= 0: raise ValueError("Sample weights must be non-negative.") - weights = sample_weight*np.ones(n_samples) + weights = sample_weight * np.ones(n_samples) else: _dtype = [np.float64, np.float32] weights = check_array(sample_weight, accept_sparse='csr', @@ -75,8 +74,8 @@ def _check_weights(sample_weight, n_samples): if weights.ndim > 1: raise ValueError("Sample weight must be 1D array or scalar") elif weights.shape[0] != n_samples: - raise ValueError("Sample weights must have the same length as" - " y") + raise ValueError("Sample weights must have the same length as " + "y") if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") elif not np.sum(weights) > 0: @@ -87,70 +86,72 @@ def _check_weights(sample_weight, n_samples): class Link(metaclass=ABCMeta): - """Abstract base class for Link funtions - """ + """Abstract base class for Link funtions.""" @abstractmethod def link(self, mu): - """The link function g(mu) with argument mu=E[Y] returns the - linear predictor. + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. Parameters ---------- mu : array, shape (n_samples,) - Usually the predicted mean. + Usually the (predicted) mean. """ raise NotImplementedError @abstractmethod def derivative(self, mu): - """Derivative of the link g'(mu). + """Compute the derivative of the link g'(mu). 
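To illustrate what a concrete implementation of this interface has to provide,
here is a hypothetical square-root link written against the five methods above;
it is a self-contained sketch (on this branch it would subclass ``Link`` from
this module)::

    import numpy as np

    class SqrtLink:
        """Hypothetical link g(mu) = sqrt(mu) with inverse h(eta) = eta**2."""

        def link(self, mu):
            return np.sqrt(mu)

        def derivative(self, mu):
            return 0.5 / np.sqrt(mu)

        def inverse(self, lin_pred):
            return lin_pred ** 2

        def inverse_derivative(self, lin_pred):
            return 2 * lin_pred

        def inverse_derivative2(self, lin_pred):
            return 2 * np.ones_like(lin_pred)

    link = SqrtLink()
    mu = np.array([0.5, 1.0, 4.0])
    # round trip h(g(mu)) = mu and chain rule h'(g(mu)) * g'(mu) = 1
    assert np.allclose(link.inverse(link.link(mu)), mu)
    assert np.allclose(link.inverse_derivative(link.link(mu)) * link.derivative(mu), 1.0)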
Parameters ---------- mu : array, shape (n_samples,) - Usually the predicted mean. + Usually the (predicted) mean. """ raise NotImplementedError @abstractmethod def inverse(self, lin_pred): - """The inverse link function h(lin_pred) with the linear predictor as - argument returns mu=E[Y]. + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linkear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative(self, lin_pred): - """Derivative of the inverse link function h'(lin_pred). + """Compute the derivative of the inverse link function h'(lin_pred). Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative2(self, lin_pred): - """Second derivative of the inverse link function h''(lin_pred). + """Compute 2nd derivative of the inverse link function h''(lin_pred). Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError class IdentityLink(Link): - """The identity link function g(x)=x. - """ + """The identity link function g(x)=x.""" def link(self, mu): return mu @@ -169,8 +170,7 @@ def inverse_derivative2(self, lin_pred): class LogLink(Link): - """The log link function g(x)=log(x). - """ + """The log link function g(x)=log(x).""" def link(self, mu): return np.log(mu) @@ -238,26 +238,22 @@ class ExponentialDispersionModel(metaclass=ABCMeta): @abstractproperty def lower_bound(self): - """The lower bound of values of Y~EDM. - """ + """The lower bound of values of Y~EDM.""" raise NotImplementedError() @abstractproperty def upper_bound(self): - """The upper bound of values of Y~EDM. - """ + """The upper bound of values of Y~EDM.""" raise NotImplementedError() @abstractproperty def include_lower_bound(self): - """If True, values of y may equal lower bound: y >= lower_bound. - """ + """If True, values of y may equal lower bound: y >= lower_bound.""" raise NotImplementedError() @abstractproperty def include_upper_bound(self): - """If True, values of y may equal upper bound: y <= upper_bound. - """ + """If True, values of y may equal upper bound: y <= upper_bound.""" raise NotImplementedError() def in_y_range(self, x): @@ -285,7 +281,9 @@ def in_y_range(self, x): @abstractmethod def unit_variance(self, mu): - r"""The unit variance :math:`v(\mu)` determines the variance as + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as a function of the mean :math:`\mu` by :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as @@ -304,7 +302,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. Parameters ---------- @@ -314,7 +314,9 @@ def unit_variance_derivative(self, mu): raise NotImplementedError() def variance(self, mu, phi=1, weights=1): - r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + r"""Compute the variance function. 
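As a quick illustration of the relationship
:math:`\mathrm{Var}[Y_i]=\phi/s_i \cdot v(\mu_i)` implemented here, the
following sketch assumes the ``TweedieDistribution`` class of this branch is
importable from ``sklearn.linear_model.glm`` (the import path and the numbers
are assumptions for illustration only)::

    import numpy as np
    from sklearn.linear_model.glm import TweedieDistribution  # only on this branch

    mu = np.array([0.5, 1.0, 2.0])
    weights = np.array([1.0, 2.0, 4.0])
    phi = 1.5

    gamma_like = TweedieDistribution(power=2)   # Gamma: unit variance v(mu) = mu**2
    var = gamma_like.variance(mu, phi=phi, weights=weights)

    # written out: Var[Y_i] = phi / s_i * v(mu_i)
    assert np.allclose(var, phi / weights * mu ** 2)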
+ + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. @@ -332,7 +334,9 @@ def variance(self, mu, phi=1, weights=1): return phi/weights * self.unit_variance(mu) def variance_derivative(self, mu, phi=1, weights=1): - r"""The derivative of the variance w.r.t. `mu`, + r"""Compute the derivative of the variance w.r.t. mu. + + Returns :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. @@ -352,10 +356,12 @@ def variance_derivative(self, mu, phi=1, weights=1): @abstractmethod def unit_deviance(self, y, mu): - r"""The unit_deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is given by + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` Parameters ---------- @@ -368,7 +374,9 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - r"""The derivative w.r.t. `mu` of the unit deviance + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. @@ -383,9 +391,12 @@ def unit_deviance_derivative(self, y, mu): return -2*(y-mu)/self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""The deviance is given by :math:`D = \sum_i s_i \cdot d(y, \mu) + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the likelihood it is :math:`D = -2\phi\cdot + In terms of the log-likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. @@ -403,15 +414,15 @@ def deviance(self, y, mu, weights=1): return np.sum(weights*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weights, link): - """The deviance as a function of the coefficients `coef` - (:math:`w`). - """ + """Compute the deviance as a function of the coefficients and data.""" lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) return self.deviance(y, mu, weights) def deviance_derivative(self, y, mu, weights=1): - """The derivative w.r.t. `mu` of the deviance. + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. Parameters ---------- @@ -427,7 +438,9 @@ def deviance_derivative(self, y, mu, weights=1): return weights*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weights, link): - r"""The score function is the derivative of the + r"""Compute the score function. + + The score function is the derivative of the log-likelihood w.r.t. `coef` (:math:`w`). It is given by @@ -453,7 +466,8 @@ def _score(self, coef, phi, X, y, weights, link): return score def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""The Fisher information matrix. + r"""Compute the Fisher information matrix. 
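As a sanity check of the score and Fisher matrix defined here, the following
NumPy sketch writes both out for a Poisson GLM with log link and
:math:`\phi=1`; with this canonical link they reduce to the familiar
expressions ``X'(y - mu)`` and ``X' diag(mu) X`` (all names below are
illustrative)::

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(200, 3)
    coef = np.array([0.1, -0.2, 0.3])
    y = rng.poisson(np.exp(X @ coef)).astype(float)
    s, phi = np.ones(200), 1.0                    # sample weights and dispersion

    eta = X @ coef                                # linear predictor
    mu = np.exp(eta)                              # log link: h(eta) = h'(eta) = exp(eta)
    h_prime, v_mu = np.exp(eta), mu               # Poisson unit variance v(mu) = mu

    # score = X' D diag(s / (phi * v(mu))) (y - mu)   with D = diag(h'(eta))
    score = X.T @ (h_prime * s / (phi * v_mu) * (y - mu))
    # Fisher matrix F = X' diag(s * h'(eta)**2 / (phi * v(mu))) X
    fisher = (X * (s * h_prime ** 2 / (phi * v_mu))[:, None]).T @ X

    assert np.allclose(score, X.T @ (y - mu))
    assert np.allclose(fisher, (X * mu[:, None]).T @ X)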
+ The Fisher information matrix, also known as expected information matrix is given by @@ -482,9 +496,10 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): - r"""The observed information matrix. + r"""Compute the observed information matrix. + The observed information matrix, also known as the negative of - the Hessian matrix of the log-likelihood. It is given by + the Hessian matrix of the log-likelihood, is given by .. math: @@ -518,8 +533,10 @@ def _observed_information(self, coef, phi, X, y, weights, link): return observed_information def _deviance_derivative(self, coef, X, y, weights, link): - r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a - function of the coefficients `coef`. + r"""Compute the derivative of the deviance w.r.t. coef. + + The derivative of the deviance w.r.t. `coef` (:math:`w`) as a + function of the coefficients `coef` and the data. This is equivalent to :math:`-2\phi` times the score function :func:`_score` (derivative of the log-likelihood). """ @@ -528,9 +545,11 @@ def _deviance_derivative(self, coef, X, y, weights, link): return -2*score def _deviance_hessian(self, coef, X, y, weights, link): - r"""The hessian matrix w.r.t. `coef` (:math:`w`) of the deviance - as a function of the coefficients `coef`. - This is equivalent to :math:`+2\phi` times the observed information + r"""Compute the hessian matrix of the deviance w.r.t. coef. + + The hessian of the deviance w.r.t. `coef` (:math:`w`) is evaluated as + a function of the coefficients `coef` and the data. + It is equivalent to :math:`+2\phi` times the observed information matrix. """ info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, @@ -538,8 +557,12 @@ def _deviance_hessian(self, coef, X, y, weights, link): return 2*info_matrix def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): - """Calculates eta (linear predictor), mu, score function (derivative - of log-likelihood) and Fisher matrix (all with phi=1) all in one go""" + """Compute linear predictor, mean, score function and fisher matrix. + + It calculates the linear predictor, the mean, score function + (derivative of log-likelihood) and Fisher information matrix + all in one go as function of `coef` (:math:`w`) and the data. + """ n_samples, n_features = X.shape # eta = linear predictor eta = safe_sparse_dot(X, coef, dense_output=True) @@ -561,7 +584,9 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): return eta, mu, score, fisher def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in (unpenalized) IRLS. + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. Parameters ---------- @@ -577,7 +602,9 @@ def starting_mu(self, y, weights=1): class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - They have :math:`\mu=\mathrm{E}[Y]` and + + A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -668,7 +695,7 @@ def include_upper_bound(self): return self._include_upper_bound def unit_variance(self, mu): - """The unit variance of a Tweedie distribution is v(mu)=mu**power. + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. 
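The power relationship of the Tweedie unit variance and its derivative can be
checked in a few lines of NumPy; the helper functions below simply restate the
formulas of the two methods above and compare the derivative against a finite
difference::

    import numpy as np

    def unit_variance(mu, power):
        return mu ** power                        # v(mu) = mu**power

    def unit_variance_derivative(mu, power):
        return power * mu ** (power - 1)          # v'(mu) = power * mu**(power-1)

    mu, eps = np.array([0.5, 1.0, 3.0]), 1e-6
    for power in (0, 1, 2, 3):                    # normal, Poisson, gamma, inverse Gaussian
        num = (unit_variance(mu + eps, power) - unit_variance(mu - eps, power)) / (2 * eps)
        assert np.allclose(num, unit_variance_derivative(mu, power), atol=1e-4)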
Parameters ---------- @@ -678,8 +705,8 @@ def unit_variance(self, mu): return np.power(mu, self.power) def unit_variance_derivative(self, mu): - """The derivative of the unit variance of a Tweedie distribution is - v(mu)=power*mu**(power-1). + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). Parameters ---------- @@ -732,9 +759,9 @@ def __init__(self): class GeneralizedHyperbolicSecand(ExponentialDispersionModel): - """A class for the von Generalized Hyperbolic Secand (GHS) distribution. + """A class for the Generalized Hyperbolic Secand (GHS) distribution. - The GHS distribution is for data y in (-inf, inf). + The GHS distribution is for tagets y in (-inf, inf). """ def __init__(self): self._lower_bound = -np.Inf @@ -770,7 +797,7 @@ def unit_deviance(self, y, mu): def _irls_step(X, W, P2, z): - """One step in iteratively reweighted least squares + """Compute one step in iteratively reweighted least squares. Solve A w = b for w with A = (X' W X + P2) @@ -823,18 +850,18 @@ def _irls_step(X, W, P2, z): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """Regression via a Generalized Linear Model (GLM) based on reproductive - Exponential Dispersion Models (EDM) with combined L1 and L2 priors as - regularizer. + """Regression via a Generalized Linear Model (GLM) with penalties. - Minimizes the objective function:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) with + combined L1 and L2 priors as regularizer minimizes the following objective + function:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w with inverse link function `h` and s=`sample_weight` (for - `sample_weight=Nones` one has s=1 and sum(s) equals `n_samples`). + `sample_weight=None`, one has s=1 and sum(s)=`n_samples`). For `P1=P2=identity`, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 @@ -893,16 +920,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ of class ExponentialDispersionModel, optional(default='normal') - the distributional assumption of the GLM, i.e. which loss function to - be minimized. + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. link : {'identity', 'log'} or an instance of class Link, optional (default='identity') - the link function of the GLM, i.e. mapping from linear predictor + The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) - method for estimation of the dispersion parameter phi. Whether to use + Method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. @@ -914,15 +941,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Sets 'irls' if l1_ratio equals 0, else 'cd'. 'irls' - iterated reweighted least squares (Fisher scoring). - It is the standard algorithm for GLMs. Cannot deal with + Iterated reweighted least squares (with Fisher scoring). + It is the standard algorithm for GLMs. It cannot deal with L1 penalties. 'newton-cg', 'lbfgs' Cannot deal with L1 penalties. 'cd' - coordinate descent algorithm. It can deal with L1 as well as L2 + Coordinate descent algorithm. It can deal with L1 as well as L2 penalties. 
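The solver rules above boil down to: any solver handles a pure L2 penalty, but
only ``'cd'`` can handle an L1 part. A small usage sketch of that rule, again
assuming the estimator from this branch (data and penalty values are
arbitrary)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor  # only on this branch

    rng = np.random.RandomState(42)
    X = rng.randn(100, 5)
    y = rng.poisson(np.exp(1.0 + 0.3 * X[:, 0] - 0.2 * X[:, 1])).astype(float)

    # pure L2 penalty (l1_ratio=0): solver='auto' resolves to IRLS
    ridge_glm = GeneralizedLinearRegressor(family='poisson', link='log',
                                           alpha=0.1, l1_ratio=0, solver='auto')
    # with an L1 part, coordinate descent is the only solver that applies
    enet_glm = GeneralizedLinearRegressor(family='poisson', link='log',
                                          alpha=0.1, l1_ratio=0.5, solver='cd')
    ridge_glm.fit(X, y)
    enet_glm.fit(X, y)
    print(ridge_glm.coef_, enet_glm.coef_)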
max_iter : int, optional (default=100) @@ -930,15 +957,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, - the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative of - the deviance). + the objective function). warm_start : boolean, optional (default=False) - If set to ``True``, reuse the solution of the previous call to fit as - initialization for ``coef_`` and ``intercept_`` (supersedes option + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to fit), option ``start_params`` sets the + does not exit (first call to ``fit``), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. start_params : {None, 'least_squares', 'zero', array of shape \ @@ -946,8 +973,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If an array of size n_features* is supplied, use it as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept includes the - intercept in counting. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. If 'least_squares' is set, the result of a least squares fit in the link space (linear predictor) is taken. If 'zero' is set, all coefficients start with zero. @@ -960,7 +987,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): For the solver 'cd' (coordinate descent), the coordinates (features) can be updated in either cyclic or random order. If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This + rather than looping over features sequentially in the same order. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. @@ -1057,7 +1084,7 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, self.verbose = verbose def fit(self, X, y, sample_weight=None): - """Fit a generalized linear model. + """Fit a Generalized Linear Model. Parameters ---------- @@ -1087,12 +1114,14 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. 
y = y.astype(np.float64) weights = _check_weights(sample_weight, y.shape[0]) # 1.2 validate arguments of __init__ ################################## - # Garantee that self._family_instance is an instance of class + # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family @@ -1112,7 +1141,8 @@ def fit(self, X, y, sample_weight=None): " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" " got (family={0})".format(self.family)) - # Garantee that self._link_instance is set to an instance of class Link + # Guarantee that self._link_instance is set to an instance of + # class Link if isinstance(self.link, Link): self._link_instance = self.link else: @@ -1127,19 +1157,19 @@ def fit(self, X, y, sample_weight=None): .format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: - raise ValueError("Penalty term must be non-negative;" + raise ValueError("Penalty term must be a non-negative number;" " got (alpha={0})".format(self.alpha)) if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be in interval [0, 1]; got" - " (l1_ratio={0})".format(self.l1_ratio)) + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: - raise ValueError("GeneralizedLinearRegressor supports only irls, " - "auto, lbfgs, newton-cg and cd solvers, got {0}" - "".format(self.solver)) + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': if self.l1_ratio == 0: @@ -1168,8 +1198,8 @@ def fit(self, X, y, sample_weight=None): elif isinstance(start_params, str): if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " - "'least-squares', 'zero' or an array of right" - " length," + "'least-squares', 'zero' or an array of " + " correct length;" " got(start_params={0})".format(start_params)) else: start_params = check_array(start_params, accept_sparse='csr', @@ -1179,21 +1209,21 @@ def fit(self, X, y, sample_weight=None): (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" "right length and dimension; required (length" - "={0}, ndim=1), got (length={1}, ndim={2})." + "={0}, ndim=1); got (length={1}, ndim={2})." 
.format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " - "'random', got (selection={0})" + "'random'; got (selection={0})" .format(self.selection)) random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) if not isinstance(self.check_input, bool): - raise ValueError("The attribute check_input must be bool; got " + raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) if self.P1 is None: @@ -1232,7 +1262,7 @@ def fit(self, X, y, sample_weight=None): link = self._link_instance if self.fit_intercept: - # intercept is first column <=> coef[0] is for intecept + # Note: intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: @@ -1259,10 +1289,11 @@ def fit(self, X, y, sample_weight=None): P2 *= l2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric - if sparse.issparse(P2): - P2 = 0.5 * (P2 + P2.transpose()) - else: - P2 = 0.5 * (P2 + P2.T) + if P2.ndim == 2: + if sparse.issparse(P2): + P2 = 0.5 * (P2 + P2.transpose()) + else: + P2 = 0.5 * (P2 + P2.T) # 1.3 additional validations ########################################## if self.check_input: @@ -1301,7 +1332,7 @@ def fit(self, X, y, sample_weight=None): # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) - weights = weights/np.sum(weights) + weights = weights/weights_sum ####################################################################### # 3. initialization of coef = (intercept_, coef_) # @@ -1338,6 +1369,8 @@ def fit(self, X, y, sample_weight=None): # with L1 penalty, start with coef = 0 # TODO: Are there better options? coef = np.zeros(n_features) + if self.fit_intercept: + coef[0] = link.link(np.mean(y)) elif isinstance(self.start_params, str): if self.start_params == 'zero': coef = np.zeros(n_features) @@ -1546,6 +1579,8 @@ def Hs(s): # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 # Note: f'=-score and H=fisher are updated at the end of outer @@ -1694,7 +1729,7 @@ def Hs(s): return self def linear_predictor(self, X): - """The linear_predictor X*coef_ + intercept_. + """Compute the linear_predictor = X*coef_ + intercept_. Parameters ---------- @@ -1741,8 +1776,7 @@ def predict(self, X, sample_weight=None): return mu*weights def estimate_phi(self, X, y, sample_weight=None): - """Estimation of the dispersion parameter phi. - Returns the estimate. + """Estimate/fit the dispersion parameter phi. Parameters ---------- @@ -1755,6 +1789,11 @@ def estimate_phi(self, X, y, sample_weight=None): sample_weight : {None, array-like}, shape (n_samples,), optional \ (default=None) Sample weights. + + Returns + ------- + phi : float + Dispersion parameter. 
""" check_is_fitted(self, "coef_") _dtype = [np.float64, np.float32] @@ -1785,15 +1824,17 @@ def estimate_phi(self, X, y, sample_weight=None): # "AssertionError: -0.28014056555724598 not greater than 0.5" # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): - r"""Returns D^2, a generalization of the coefficient of determination - R^2, which uses deviance instead of squared error. + r"""Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. D^2 is defined as :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is averaged by sample_weight. In the case of a Normal - distribution, D^2 equals R^2. + :math:`\bar{y}` is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -1812,7 +1853,7 @@ def score(self, X, y, sample_weight=None): Returns ------- score : float - D^2 of self.predict(X) wrt. y. + D^2 of self.predict(X) w.r.t. y. """ # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get From 01033e36d913756f7ff5e2214189cf1d7426dee1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 20 Feb 2019 17:20:09 +0100 Subject: [PATCH 044/209] Fix false formula in starting_mu and improve start_params --- sklearn/linear_model/glm.py | 108 ++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index d69ccd0a66486..fad7492acc2fb 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -583,7 +583,7 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): fisher = safe_sparse_dot(X.T, temp, dense_output=False) return eta, mu, score, fisher - def starting_mu(self, y, weights=1): + def starting_mu(self, y, weights=1, ind_weight=0.5): """Set starting values for the mean mu. These may be good starting points for the (unpenalized) IRLS solver. @@ -595,9 +595,13 @@ def starting_mu(self, y, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. """ - return ((weights*y+np.mean(weights*y)) / - (2.*np.sum(np.ones_like(y)*weights))) + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) class TweedieDistribution(ExponentialDispersionModel): @@ -852,17 +856,19 @@ def _irls_step(X, W, P2, z): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. - GLMs based on a reproductive Exponential Dispersion Model (EDM) with - combined L1 and L2 priors as regularizer minimizes the following objective - function:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean `mu=h(X*w)`. 
Therefore the fit minimizes + the following objective function with combined L1 and L2 priors as + regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w with inverse link function `h` and s=`sample_weight` (for - `sample_weight=None`, one has s=1 and sum(s)=`n_samples`). - For `P1=P2=identity`, the penalty is the elastic net:: + ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). + For `P1=P2=identity` (``P1=None``, ``P2=None``), the penalty is the + elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 @@ -966,24 +972,34 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): as initialization for ``coef_`` and ``intercept_`` (supersedes option ``start_params``). If set to ``True`` or if the attribute ``coef_`` does not exit (first call to ``fit``), option ``start_params`` sets the - starting values for ``coef_`` and ``intercept_``. + start values for ``coef_`` and ``intercept_``. + + start_params : {'irls', 'least_squares', 'zero', array of shape \ + (n_features*, )}, optional (default='irls') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'irls' + Start values of mu are calculated by family.starting_mu(..). Then, + one step of irls obtains start values for ``coef_`. This gives + usually good results. - start_params : {None, 'least_squares', 'zero', array of shape \ - (n_features*, )}, optional (default=None) - If an array of size n_features* is supplied, use it as start values - for ``coef_`` in the fit. If ``fit_intercept=True``, the first element + 'least_squares' + Start values for ``coef_`` are obtained by a least squares fit in the + link space (y is transformed to the space of the linear predictor). + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes the intercept in counting. - If 'least_squares' is set, the result of a least squares fit in the - link space (linear predictor) is taken. - If 'zero' is set, all coefficients start with zero. - If ``None``, the start values are calculated by setting mu to - family.starting_mu(..) and one step of irls. - These options only apply if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - selection : str, optional (default='random') + selection : str, optional (default='cyclic') For the solver 'cd' (coordinate descent), the coordinates (features) can be updated in either cyclic or random order. If set to 'random', a random coefficient is updated every iteration @@ -1005,7 +1021,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): check_input : boolean, optional (default=True) Allow to bypass several checks on input: y values in range of family, - sample_weights non-negative, P2 positive semi-definite. + sample_weight non-negative, P2 positive semi-definite. Don't use this parameter unless you know what you do. 
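A typical use of ``warm_start=True`` is fitting a path of decreasing ``alpha``
values, where every fit continues from the previous solution instead of the
``start_params`` strategy. A sketch of that pattern, assuming the estimator
from this branch (the path values are arbitrary)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor  # only on this branch

    rng = np.random.RandomState(0)
    X = rng.randn(200, 10)
    y = rng.poisson(np.exp(X @ np.r_[0.5, -0.5, np.zeros(8)])).astype(float)

    glm = GeneralizedLinearRegressor(family='poisson', link='log', l1_ratio=0.9,
                                     solver='cd', warm_start=True)
    coefs = []
    for alpha in np.logspace(0, -3, 10):      # from strong to weak penalty
        glm.set_params(alpha=alpha)
        glm.fit(X, y)                         # reuses the previous coef_ as start value
        coefs.append(glm.coef_.copy())
    print([int(np.count_nonzero(c)) for c in coefs])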
verbose : int, optional (default=0) @@ -1061,8 +1077,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params=None, - selection='random', random_state=None, copy_X=True, + tol=1e-4, warm_start=False, start_params='irls', + selection='cyclic', random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio @@ -1193,11 +1209,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is None: - pass - elif isinstance(start_params, str): - if start_params not in ['least_squares', 'zero']: - raise ValueError("The argument start_params must be None, " + if isinstance(start_params, str): + if start_params not in ['irls', 'least_squares', 'zero']: + raise ValueError("The argument start_params must be 'irls', " "'least-squares', 'zero' or an array of " " correct length;" " got(start_params={0})".format(start_params)) @@ -1348,11 +1362,11 @@ def fit(self, X, y, sample_weight=None): self.coef_)) else: coef = self.coef_ - elif self.start_params is None: - if self.l1_ratio == 0: + elif isinstance(start_params, str): + if start_params == 'irls': # See 3.1 IRLS # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weights) + mu = family.starting_mu(y, weights=weights) # linear predictor eta = link.link(mu) # h'(eta) @@ -1365,16 +1379,9 @@ def fit(self, X, y, sample_weight=None): # solve A*coef = b # A = X' W X + l2 P2, b = X' W z coef = _irls_step(Xnew, W, P2, z) - else: - # with L1 penalty, start with coef = 0 - # TODO: Are there better options? - coef = np.zeros(n_features) - if self.fit_intercept: - coef[0] = link.link(np.mean(y)) - elif isinstance(self.start_params, str): - if self.start_params == 'zero': - coef = np.zeros(n_features) - elif self.start_params == 'least_squares': + elif start_params == 'least_squares': + # less restrictive tolerance for finding start values + tol = np.max([self.tol, np.sqrt(self.tol)]) if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1384,18 +1391,21 @@ def fit(self, X, y, sample_weight=None): # => use Ridge # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha*n_samples, - tol=np.max([self.tol, np.sqrt(self.tol)])) + alpha=self.alpha*n_samples, tol=tol) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? 
reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio, - tol=np.max([self.tol, np.sqrt(self.tol)])) + tol=tol) reg.fit(Xnew, link.link(y)) coef = reg.coef_ - else: + else: # start_params == 'zero' + coef = np.zeros(n_features) + if self.fit_intercept: + coef[0] = link.link(np.average(y, weights=weights)) + else: # assign given array as start values coef = start_params ####################################################################### @@ -1560,6 +1570,8 @@ def Hs(s): # some precalculations eta, mu, score, fisher = family._eta_mu_score_fisher( coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) # initial stopping tolerance of inner loop # use L1-norm of minimum-norm of subgradient of F # fp_wP2 = f'(w) + w*P2 @@ -1574,8 +1586,8 @@ def Hs(s): # outer loop while self.n_iter_ < self.max_iter: self.n_iter_ += 1 - # initialize search direction d (to be optimized) - d = np.zeros_like(coef) + # initialize search direction d (to be optimized) with zero + d.fill(0) # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper From 4071a8a54de0112fd1afd6d8fc5b5585708c84ea Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 20 Feb 2019 21:15:04 +0100 Subject: [PATCH 045/209] Improve argument handling of P1 and P2 * P2 also accepts 1d array and interprets it as diagonal matrix * improved input checks for P1 and P2 --- sklearn/linear_model/glm.py | 103 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 31 ++++---- 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index fad7492acc2fb..33e0d75730e3a 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -45,7 +45,10 @@ from abc import ABCMeta, abstractmethod, abstractproperty import numbers import numpy as np -from scipy import linalg, optimize, sparse, special +from scipy import linalg, sparse +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +from scipy.special import xlogy import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -727,7 +730,7 @@ def unit_deviance(self, y, mu): if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (special.xlogy(y, y/mu) - y + mu) + return 2 * (xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -840,7 +843,7 @@ def _irls_step(X, W, P2, z): XtW = X.transpose() * W A = XtW * X + L2 b = XtW * z - coef = sparse.linalg.spsolve(A, b) + coef = splinalg.spsolve(A, b) else: XtW = (X.T * W) A = XtW.dot(X) @@ -867,7 +870,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): with inverse link function `h` and s=`sample_weight` (for ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). - For `P1=P2=identity` (``P1=None``, ``P2=None``), the penalty is the + For ``P1=P2='identity'`` (``P1=None``, ``P2=None``), the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 @@ -904,21 +907,24 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. 
- P1 : {None, array-like}, shape (n_features,), optional \ - (default=None) + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as a 1d array of ones. + default value ``'identity'`` is the same as a 1d array of ones. Note that n_features = X.shape[1]. - P2 : {None, array-like, sparse matrix}, shape \ - (n_features, n_features), optional (default=None) - With this square matrix the L2 penalty is calculated as `w P2 w`. - This gives a fine control over this penalty (Tikhonov - regularization). The diagonal zeros of a diagonal P2, for example, - exclude all corresponding coefficients from the L2 penalty. - The default value ``None`` is the same as the identity matrix. - Note that n_features = X.shape[1]. P2 must be positive semi-definite. + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be @@ -1074,7 +1080,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='identity', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', @@ -1240,20 +1246,23 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) - if self.P1 is None: + if isinstance(self.P1, str) and self.P1 == 'identity': P1 = np.ones(X.shape[1]) else: - P1 = np.copy(np.atleast_1d(self.P1)) - if P1.dtype.kind not in ['b', 'i', 'u', 'f']: - raise ValueError("P1 must be a numeric value; " - "got (dtype={0}).".format(P1.dtype)) + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=True) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): - raise ValueError("P1 must be either None or a 1d array with " - "the length of X.shape[1]; " + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." 
.format(P1.shape[0], X.shape[1])) - if self.P2 is None: + if isinstance(self.P2, str) and self.P2 == 'identity': if not sparse.issparse(X): P2 = np.ones(X.shape[1]) else: @@ -1262,8 +1271,15 @@ def fit(self, X, y, sample_weight=None): else: P2 = check_array(self.P2, copy=True, accept_sparse=['csr', 'csc', 'coo'], - dtype="numeric", ensure_2d=True) - if ((P2.ndim != 2) or + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + if P2.shape[0] != X.shape[1]: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + elif ((P2.ndim != 2) or (P2.shape[0] != P2.shape[1]) or (P2.shape[0] != X.shape[1])): raise ValueError("P2 must be either None or an array of shape " @@ -1319,21 +1335,32 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. - if self.P1 is not None: + if self.P1 != 'identity': if not np.all(P1 >= 0): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite - if self.P2 is not None: - if sparse.issparse(P2): - # TODO: check sparse P2 for non-negativeness - # raise NotImplementedError("Check sparse P2 for " - # "non-negativeness is not yet " - # "implemented.") - pass - elif P2.ndim == 2: - if not np.all(np.linalg.eigvals(P2) >= -1e-15): - raise ValueError("P2 must be positive definite.") + if self.P2 != 'identity': + # due to numerical precision, we allow eigenvalues to be a + # tiny bit negative + epsneg = 10 * np.finfo(P2.dtype).epsneg + if P2.ndim == 1 or P2.shape[0] == 1: + if not np.all(P2 >= 0): + raise ValueError("1d array P2 must not have negative " + "values.") + elif sparse.issparse(P2): + # for sparse matrices, not all eigenvals can be computed + # efficiently, use only half of n_features + # k = how many eigenvals to compute + k = np.min([10, n_features // 10 + 1]) + sigma = 0 # start searching near this value + which = 'SA' # find smallest algebraic eigenvalues first + if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, + which=which) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + else: + if not np.all(linalg.eigvalsh(P2) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? 
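# Illustrative sketch (not part of the patched code): the hunk above accepts a
# 1d P2 (interpreted as a diagonal matrix) or a 2d P2 and rejects matrices
# that are not positive semi-definite. A minimal, hypothetical example of
# building such a Tikhonov penalty and checking it in a similar way; all
# names below are made up for illustration only.
import numpy as np

n_features = 5
# 1d form: non-negative entries act as per-coefficient L2 weights; zeros
# exclude the corresponding coefficients from the L2 penalty.
p2_diag = np.array([1.0, 1.0, 0.0, 1.0, 1.0])
assert np.all(p2_diag >= 0)

# 2d form: a first-difference smoothness penalty D.T @ D is symmetric and
# positive semi-definite by construction.
D = np.diff(np.eye(n_features), axis=0)   # shape (n_features - 1, n_features)
P2 = D.T @ D
# eigenvalue check similar to the hunk above, allowing tiny negative round-off
assert np.all(np.linalg.eigvalsh(P2) >= -10 * np.finfo(P2.dtype).eps)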
@@ -1520,7 +1547,7 @@ def Hs(s): args = (Xnew, y, weights, link) if solver == 'lbfgs': - coef, loss, info = optimize.fmin_l_bfgs_b( + coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, maxiter=self.max_iter) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1ac5ccd4d3d5c..fde1604ad16e3 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -188,25 +188,26 @@ def test_glm_l1_ratio_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_P1_argument(): - """Test GLM P1 arguments - """ +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3]]) +def test_glm_P1_argument(P1): + """Test GLM P1 arguments.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for P1 in [['a string', 'a string'], [1, [2]], [1, 2, 3]]: - glm = GeneralizedLinearRegressor(P1=P1) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(P1=P1) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) -# def test_glm_P2_argument(): -# """Test GLM P2 arguments -# """ -# y = np.array([1, 2]) -# X = np.array([[1], [1]]) -# for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Wed, 20 Feb 2019 22:48:14 +0100 Subject: [PATCH 046/209] Fix doctest, test_poisson_enet, change IRLS to use lstsq, fix input checks * adapt examples of GeneralizedLinearModel to new defaults for P1, P2 and selection * fix precision/decimal issue in test_poisson_enet * use more robust least squares instead of solve in IRLS * fix sign error in input checks --- doc/modules/linear_model.rst | 16 +++++++++------- sklearn/linear_model/glm.py | 25 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 24 ++++++++++++++++++++---- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 174d1e4eddae4..e60e9e84a4747 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -924,15 +924,17 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE - GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion=None, - fit_intercept=True, l1_ratio=0, link='log', max_iter=100, - random_state=None, selection='random', solver='auto', - start_params=None, tol=0.0001, verbose=0, warm_start=False) + GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, + check_input=True, copy_X=True, family='poisson', + fit_dispersion=None, fit_intercept=True, l1_ratio=0, + link='log', max_iter=100, random_state=None, + selection='cyclic', solver='auto', + start_params='irls', tol=0.0001, verbose=0, + warm_start=False) >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE - array([0.24630255, 0.43373521]) + array([0.24630169, 0.43373464]) >>> reg.intercept_ #doctest: +ELLIPSIS - -0.76383575... + -0.76383633... .. topic:: Examples: diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 33e0d75730e3a..bc1a0434fa3b0 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -830,8 +830,9 @@ def _irls_step(X, W, P2, z): ------- coef: array, shape (X.shape[1]) """ - # TODO: scipy.linalg.solve seems faster, but ordinary least squares uses - # scipy.linalg.lstsq. 
What is more appropriate? + # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. n_samples, n_features = X.shape if sparse.issparse(X): W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() @@ -843,7 +844,8 @@ def _irls_step(X, W, P2, z): XtW = X.transpose() * W A = XtW * X + L2 b = XtW * z - coef = splinalg.spsolve(A, b) + # coef = splinalg.spsolve(A, b) + coef, *_ = splinalg.lsmr(A, b) else: XtW = (X.T * W) A = XtW.dot(X) @@ -852,7 +854,8 @@ def _irls_step(X, W, P2, z): else: A += P2 b = XtW.dot(z) - coef = linalg.solve(A, b) + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) return coef @@ -1340,12 +1343,15 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite - if self.P2 != 'identity': + if not isinstance(self.P2, str): # self.P2 != 'identity' # due to numerical precision, we allow eigenvalues to be a # tiny bit negative - epsneg = 10 * np.finfo(P2.dtype).epsneg + epsneg = -10 * np.finfo(P2.dtype).epsneg if P2.ndim == 1 or P2.shape[0] == 1: - if not np.all(P2 >= 0): + p2 = P2 + if sparse.issparse(P2): + p2 = P2.toarray() + if not np.all(p2 >= 0): raise ValueError("1d array P2 must not have negative " "values.") elif sparse.issparse(P2): @@ -1360,6 +1366,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): + return P2 raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? @@ -1689,7 +1696,7 @@ def Hs(s): mn_subgrad = (np.where(coef + d == 0, np.sign(A)*np.maximum(np.abs(A)-P1, 0), A+np.sign(coef+d)*P1)) - mn_subgrad = np.sum(np.abs(mn_subgrad)) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= inner_tol: if inner_iter == 1: inner_tol = inner_tol/4. 
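# Illustrative sketch (not part of the patched code): the stopping criterion
# in the hunk above is the L1 norm of the minimum-norm subgradient of
# F(w) = f(w) + ||P1*w||_1, where f is the smooth part and P1_j are the
# per-coefficient L1 weights. Elementwise it is
#   f'_j(w) + P1_j * sign(w_j)                  if w_j != 0
#   sign(f'_j(w)) * max(|f'_j(w)| - P1_j, 0)    if w_j == 0
# which mirrors the np.where expression used above. A tiny standalone sketch
# with made-up numbers:
import numpy as np

def min_norm_subgradient(grad_f, w, P1):
    """Smallest-magnitude element of the subdifferential of f + ||P1*w||_1."""
    return np.where(w == 0,
                    np.sign(grad_f) * np.maximum(np.abs(grad_f) - P1, 0),
                    grad_f + np.sign(w) * P1)

grad_f = np.array([0.05, -2.0, 0.3])   # hypothetical gradient of f at w
w = np.array([0.0, 1.5, 0.0])
P1 = np.array([0.1, 0.1, 0.1])
subgrad = min_norm_subgradient(grad_f, w, P1)
print(np.linalg.norm(subgrad, ord=1))  # compared against the tolerance to stop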
@@ -1740,7 +1747,7 @@ def Hs(s): mn_subgrad = (np.where(coef == 0, np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), fp_wP2+np.sign(coef)*P1)) - mn_subgrad = np.sum(np.abs(mn_subgrad)) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= self.tol: converged = True break diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index fde1604ad16e3..8893028d0176a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose import pytest import scipy as sp -from scipy import sparse +from scipy import sparse, optimize from sklearn.linear_model.glm import ( Link, @@ -199,8 +199,7 @@ def test_glm_P1_argument(P1): @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3]), - sparse.lil_matrix([[1]])]) + sparse.csr_matrix([1, 2, 3])]) def test_glm_P2_argument(P2): """Test GLM P2 arguments.""" y = np.array([1, 2]) @@ -515,12 +514,29 @@ def test_poisson_enet(): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-7, + link='log', solver='cd', tol=1e-8, selection='random', random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:]+coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_almost_equal(glm.intercept_, res.x[0], decimal=5) + assert_almost_equal(glm.coef_, res.x[1:], decimal=5) + assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, decimal=8) + # same for start_params='zero' and selection='cyclic' # with reduced precision glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', From ed8e74f97d2b1921af4b8c2907c9e30629788bdc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 23 Feb 2019 14:13:22 +0100 Subject: [PATCH 047/209] Use pytest decorators and pytest.raises --- sklearn/linear_model/glm.py | 7 +- sklearn/linear_model/tests/test_glm.py | 510 ++++++++++++------------- 2 files changed, 257 insertions(+), 260 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index bc1a0434fa3b0..f583e17433ee3 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -765,8 +765,8 @@ def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) -class GeneralizedHyperbolicSecand(ExponentialDispersionModel): - """A class for the Generalized Hyperbolic Secand (GHS) distribution. +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. The GHS distribution is for tagets y in (-inf, inf). """ @@ -1338,7 +1338,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. 
- if self.P1 != 'identity': + if not isinstance(self.P1, str): # if self.P1 != 'identity': if not np.all(P1 >= 0): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite @@ -1366,7 +1366,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): - return P2 raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 8893028d0176a..361a237f2cc9f 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose import pytest import scipy as sp -from scipy import sparse, optimize +from scipy import linalg, optimize, sparse from sklearn.linear_model.glm import ( Link, @@ -11,354 +11,355 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecand, + GeneralizedHyperbolicSecant, GeneralizedLinearRegressor) from sklearn.linear_model import ElasticNet, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, - assert_array_equal, assert_array_almost_equal, - assert_raises) + assert_array_equal, assert_array_almost_equal) -def test_link_properties(): - """Test link inverse and derivative - """ +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" rng = np.random.RandomState(0) x = rng.rand(100)*100 - # from sklearn.linear_model.glm import Link - # for link in vars()['Link'].__subclasses__(): - for link in Link.__subclasses__(): - link = link() - assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) - assert_almost_equal(link.inverse_derivative(link.link(x)), - 1/link.derivative(x), decimal=10) - - -def test_family_bounds(): - """Test the valid range of distributions - """ - family = NormalDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [True, True, True]) - - family = PoissonDistribution() + link = link() # instatiate object + assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) + assert_almost_equal(link.inverse_derivative(link.link(x)), + 1/link.derivative(x), decimal=10) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, True, True]) - - family = TweedieDistribution(power=1.5) - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, True, True]) - - family = GammaDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - family = InverseGaussianDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - family = TweedieDistribution(power=4.5) - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - -def test_deviance_zero(): 
- """Test deviance(y,y) = 0 for different families - """ - for family in [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=-2.5), - TweedieDistribution(power=-1), - TweedieDistribution(power=1.5), - TweedieDistribution(power=2.5), - TweedieDistribution(power=4), - GeneralizedHyperbolicSecand()]: - assert_almost_equal(family.deviance(0.1, 0.1), 0, decimal=10) - assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) - - -def test_fisher_matrix(): + assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_almost_equal(family.deviance(x, x), 0, decimal=10) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())]) +def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. Trick: Use numerical differentiation with y = mu""" - for family in [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution()]: - link = LogLink() - rng = np.random.RandomState(0) - coef = np.array([-2, 1, 0, 1, 2.5]) - phi = 0.5 - X = rng.randn(10, 5) - lin_pred = np.dot(X, coef) - mu = link.inverse(lin_pred) - weights = rng.randn(10)**2 + 1 - fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - approx = np.array([]).reshape(0, coef.shape[0]) - for i in range(coef.shape[0]): - def f(coef): - return -family._score(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link)[i] - approx = np.vstack( - [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)] - ) - assert_allclose(fisher, approx, rtol=1e-3) + rng = np.random.RandomState(0) + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) def test_sample_weights_validation(): - """Test the raised errors in the validation of sample_weight""" + """Test the raised errors in the validation of sample_weight.""" # 1. scalar value but not positive X = [[1]] y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 2. 
2d array weights = [[0]] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 3. 1d but wrong length weights = [1, 0] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 4. 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 5. 1d but weith a negative value weights = [2, -1] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) def test_glm_family_argument(): - """Test GLM family argument set as string - """ + """Test GLM family argument set as string.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) + X = np.array([[1], [2]]) for (f, fam) in [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse.gaussian', InverseGaussianDistribution())]: - glm = GeneralizedLinearRegressor(family=f, fit_intercept=False, - alpha=0).fit(X, y) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) assert_equal(type(glm._family_instance), type(fam)) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) - assert_raises(ValueError, glm.fit, X, y) + with pytest.raises(ValueError): + glm.fit(X, y) def test_glm_link_argument(): - """Test GLM link argument set as string - """ + """Test GLM link argument set as string.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) + X = np.array([[1], [2]]) for (l, link) in [('identity', IdentityLink()), ('log', LogLink())]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - link=l).fit(X, y) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) assert_equal(type(glm._link_instance), type(link)) - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - link='not a link') - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_alpha_argument(): - """Test GLM alpha argument - """ +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for alpha in ['not a number', -4.2]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - alpha=alpha) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_l1_ratio_argument(): - """Test GLM l1_ratio argument - """ +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for l1_ratio in ['not a number', -4.2, 1.1, [1]]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - l1_ratio=l1_ratio) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) -@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3]]) +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) def test_glm_P1_argument(P1): - """Test GLM P1 
arguments.""" + """Test GLM for invalid P1 argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(P1=P1) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) with pytest.raises((ValueError, TypeError)): glm.fit(X, y) @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3])]) + sparse.csr_matrix([1, 2, 3]), [-1]]) def test_glm_P2_argument(P2): - """Test GLM P2 arguments.""" + """Test GLM for invalid P2 argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False) - with pytest.raises((ValueError, TypeError)): + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): glm.fit(X, y) -def test_glm_fit_intercept_argument(): - """Test GLM fit_intercept argument - """ - y = np.array([1, 2]) - X = np.array([[1], [1]]) - for fit_intercept in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) - assert_raises(ValueError, glm.fit, X, y) +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + rng = np.random.RandomState(42) + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_solver_argument(): - """Test GLM solver argument - """ +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for solver in ['not a solver', 1, [1]]: - glm = GeneralizedLinearRegressor(solver=solver) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError): + glm.fit(X, y) - # solver not suitable for L1 penalty - for solver in ['irls', 'lbfgs', 'newton-cg']: - glm = GeneralizedLinearRegressor(solver=solver, alpha=1, l1_ratio=0.1) - assert_raises(ValueError, glm.fit, X, y) +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_max_iter_argument(): - """Test GLM max_iter argument - """ + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for max_iter in ['not a number', 0, -1, 5.5, [1]]: - glm = GeneralizedLinearRegressor(max_iter=max_iter) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_tol_argument(): - """Test GLM tol argument - """ +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for 
invalid tol argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for tol in ['not a number', 0, -1.0, [1e-3]]: - glm = GeneralizedLinearRegressor(tol=tol) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_warm_start_argument(): - """Test GLM warm_start argument - """ +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for warm_start in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(warm_start=warm_start) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_start_params_argument(): - """Test GLM start_params argument - """ +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for start_params in ['not a start_params', ['zero'], [0, 0, 0], - [[0, 0]], ['a', 'b']]: - glm = GeneralizedLinearRegressor(start_params=start_params) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_selection_argument(): - """Test GLM selection argument - """ +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for selection in ['not a selection', 1, 0, ['cyclic']]: - glm = GeneralizedLinearRegressor(selection=selection) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_random_state_argument(): - """Test GLM random_state argument - """ +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for random_state in ['a string', 0.5, [0]]: - glm = GeneralizedLinearRegressor(random_state=random_state) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_copy_X_argument(): - """Test GLM copy_X arguments - """ +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X arguments.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for copy_X in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(copy_X=copy_X) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(copy_X=copy_X) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_check_input_argument(): - """Test GLM check_input argument - """ +@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) +def test_glm_check_input_argument(check_input): + """Test GLM for invalid check_input argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for check_input in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(check_input=check_input) - 
assert_raises(ValueError, glm.fit, X, y) - + glm = GeneralizedLinearRegressor(check_input=check_input) + with pytest.raises(ValueError): + glm.fit(X, y) -# TODO: check additional validations if check_input == True -def test_glm_identiy_regression(): - """Test GLM regression with identity link on a simple dataset - """ +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_identiy_regression(family, solver): + """Test GLM regression with identity link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - families = ( - NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - for family in families: - glm = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=False, solver=solver) - res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) - - -def test_glm_log_regression(): - """Test GLM regression with log link on a simple dataset - """ + glm = GeneralizedLinearRegressor(alpha=0, family=family, + fit_intercept=False, solver=solver) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_log_regression(family, solver): + """Test GLM regression with log link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) - families = ( - NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg']: - for family in families: - glm = GeneralizedLinearRegressor( + glm = GeneralizedLinearRegressor( alpha=0, family=family, link=LogLink(), fit_intercept=False, solver=solver, start_params='least_squares') - res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) @pytest.mark.filterwarnings('ignore::DeprecationWarning') -def test_normal_ridge(): - """Test ridge regression for Normal distributions +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_normal_ridge(solver): + """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. 
""" @@ -375,25 +376,23 @@ def test_normal_ridge(): ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, - family='normal', link='identity', - fit_intercept=True, tol=1e-6, - max_iter=100, solver=solver, - random_state=42) - glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=1e-6, max_iter=100, solver=solver, + random_state=42) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, - family='normal', link='identity', - fit_intercept=False, solver='irls', - fit_dispersion='chisqr') + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=1e-6, max_iter=100, solver=solver, + random_state=42, fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -413,23 +412,22 @@ def test_normal_ridge(): ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, - family='normal', link='identity', - fit_intercept=True, solver=solver, - max_iter=300, random_state=42) - glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, + family='normal', link='identity', + fit_intercept=True, solver=solver, + max_iter=300, random_state=42) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', - fit_intercept=False, solver='irls') + fit_intercept=False, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -438,7 +436,7 @@ def test_normal_ridge(): def test_poisson_ridge(): - """Test ridge regression with poisson family and LogLink + """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" # library("glmnet") @@ -470,7 +468,7 @@ def test_poisson_ridge(): def test_normal_enet(): - """Tet elastic net regression with normal/gaussian family""" + """Test elastic net regression with normal/gaussian family.""" rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 n_samples, n_features = 20, 2 @@ -495,7 +493,7 @@ def test_normal_enet(): def test_poisson_enet(): - """Test elastic net regression with poisson family and LogLink + """Test elastic net regression with poisson family and LogLink. Compare to R's glmnet""" # library("glmnet") From fe876da908a7d5aefe8fa9ac56f4c5130ccf83df Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 24 Feb 2019 12:45:55 +0100 Subject: [PATCH 048/209] Add Logistic regression=Binomial + Logit * add Binomial distribution * add Logit link * tests for binomial against LogisticRegression * option 'auto' for link * reduce code duplication by replacing @abstractproperty by @property --- sklearn/linear_model/glm.py | 160 +++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 57 +++++++-- 2 files changed, 149 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index f583e17433ee3..01e40b322946c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -42,13 +42,12 @@ from __future__ import division -from abc import ABCMeta, abstractmethod, abstractproperty +from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import linalg, sparse +from scipy import linalg, sparse, special import scipy.sparse.linalg as splinalg from scipy.optimize import fmin_l_bfgs_b -from scipy.special import xlogy import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -191,6 +190,28 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. - 2 * ep) + + class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). @@ -238,26 +259,25 @@ class ExponentialDispersionModel(metaclass=ABCMeta): https://en.wikipedia.org/wiki/Exponential_dispersion_model. 
""" - - @abstractproperty + @property def lower_bound(self): - """The lower bound of values of Y~EDM.""" - raise NotImplementedError() + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound - @abstractproperty + @property def upper_bound(self): - """The upper bound of values of Y~EDM.""" - raise NotImplementedError() + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound - @abstractproperty + @property def include_lower_bound(self): - """If True, values of y may equal lower bound: y >= lower_bound.""" - raise NotImplementedError() + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound - @abstractproperty + @property def include_upper_bound(self): - """If True, values of y may equal upper bound: y <= upper_bound.""" - raise NotImplementedError() + """Get True if upper bound for y is includede: y <= upper_bound.""" + return self._include_upper_bound def in_y_range(self, x): """Returns true if `x` is in the valid range of Y~EDM. @@ -685,22 +705,6 @@ def power(self, power): .format(power)) self._power = power - @property - def lower_bound(self): - return self._lower_bound - - @property - def upper_bound(self): - return self._upper_bound - - @property - def include_lower_bound(self): - return self._include_lower_bound - - @property - def include_upper_bound(self): - return self._include_upper_bound - def unit_variance(self, mu): """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. @@ -730,7 +734,7 @@ def unit_deviance(self, y, mu): if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (xlogy(y, y/mu) - y + mu) + return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -776,22 +780,6 @@ def __init__(self): self._include_lower_bound = False self._include_upper_bound = False - @property - def lower_bound(self): - return self._lower_bound - - @property - def upper_bound(self): - return self._upper_bound - - @property - def include_lower_bound(self): - return self._include_lower_bound - - @property - def include_upper_bound(self): - return self._include_upper_bound - def unit_variance(self, mu): return 1 + mu**2 @@ -803,6 +791,27 @@ def unit_deviance(self, y, mu): np.log((1+mu**2)/(1+y**2))) +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution is for tagets y in [0, 1]. + """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2*(special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + def _irls_step(X, W, P2, z): """Compute one step in iteratively reweighted least squares. @@ -933,15 +942,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ - of class ExponentialDispersionModel, optional(default='normal') + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') The distributional assumption of the GLM, i.e. 
which distribution from the EDM, specifies the loss function to be minimized. - link : {'identity', 'log'} or an instance of class Link, - optional (default='identity') + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, + optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor - (X*coef) to expectation (mu). + (X*coef) to expectation (mu). Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) Method for estimation of the dispersion parameter phi. Whether to use @@ -1084,7 +1101,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', - fit_intercept=True, family='normal', link='identity', + fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', selection='cyclic', random_state=None, copy_X=True, @@ -1159,27 +1176,48 @@ def fit(self, X, y, sample_weight=None): self._family_instance = GammaDistribution() elif self.family == 'inverse.gaussian': self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() else: raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" - " got (family={0})".format(self.family)) + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link if isinstance(self.link, Link): self._link_instance = self.link else: - if self.link == 'identity': + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. 
not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': self._link_instance = IdentityLink() elif self.link == 'log': self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() else: raise ValueError( - "The link must be an instance of class Link or" - " an element of ['identity', 'log']; got (link={0})" - .format(self.link)) + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: raise ValueError("Penalty term must be a non-negative number;" diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 361a237f2cc9f..de0857a34fe3a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -4,16 +4,18 @@ import scipy as sp from scipy import linalg, optimize, sparse +from sklearn.datasets import make_classification from sklearn.linear_model.glm import ( Link, IdentityLink, LogLink, + LogitLink, TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecant, + GeneralizedHyperbolicSecant, BinomialDistribution, GeneralizedLinearRegressor) -from sklearn.linear_model import ElasticNet, Ridge +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, @@ -26,9 +28,19 @@ def test_link_properties(link): rng = np.random.RandomState(0) x = rng.rand(100)*100 link = link() # instatiate object - assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) - assert_almost_equal(link.inverse_derivative(link.link(x)), - 1/link.derivative(x), decimal=10) + decimal = 10 + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + decimal = 8 + assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_almost_equal(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x), decimal=decimal) + # for LogitLink, in the following x should be between 0 and 1. 
+ # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) @pytest.mark.parametrize( @@ -214,6 +226,12 @@ def test_glm_P2_positive_semidefinite(): with pytest.raises(ValueError): glm.fit(X, y) + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + @pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) def test_glm_fit_intercept_argument(fit_intercept): @@ -331,7 +349,7 @@ def test_glm_identiy_regression(family, solver): coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor(alpha=0, family=family, + glm = GeneralizedLinearRegressor(alpha=0, family=family, link='identity', fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -350,7 +368,7 @@ def test_glm_log_regression(family, solver): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link=LogLink(), fit_intercept=False, + alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='least_squares') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -557,3 +575,28 @@ def obj(coef): glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. + """ + l1_ratio = 0.5 + n_samples = 500 + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=0) + log = LogisticRegression( + penalty='elasticnet', random_state=0, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) + assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) From 2993e03dbfc89b068373718c82f65957639767ac Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 7 Apr 2019 15:33:27 +0200 Subject: [PATCH 049/209] More efficient sparse matrices and refactor of irls and cd solver * refactor into function _irls_solver * refactor into function _cd_solver * replace of safe_sparse_dot by matmul operator @ * more efficient handling of fisher matrix * sparse coo matrices are converted to csc or csr * sample weights don't except sparse matrices * minor doc changes --- sklearn/linear_model/glm.py | 1101 ++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 85 +- 2 files changed, 717 insertions(+), 469 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 01e40b322946c..b2de866a4b69d 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -55,13 +55,12 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted, 
check_random_state def _check_weights(sample_weight, n_samples): - """Check that weights are non-negative and have the right shape.""" + """Check that sample weights are non-negative and have the right shape.""" if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): @@ -70,7 +69,7 @@ def _check_weights(sample_weight, n_samples): weights = sample_weight * np.ones(n_samples) else: _dtype = [np.float64, np.float32] - weights = check_array(sample_weight, accept_sparse='csr', + weights = check_array(sample_weight, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype=_dtype) if weights.ndim > 1: @@ -247,12 +246,11 @@ class ExponentialDispersionModel(metaclass=ABCMeta): deviance_derivative starting_mu + _mu_deviance_derivative _score _fisher_matrix _observed_information - _deviance - _deviance_derivative - _deviance_hessian + _eta_mu_score_fisher References ---------- @@ -280,7 +278,7 @@ def include_upper_bound(self): return self._include_upper_bound def in_y_range(self, x): - """Returns true if `x` is in the valid range of Y~EDM. + """Returns ``True`` if x is in the valid range of Y~EDM. Parameters ---------- @@ -411,7 +409,7 @@ def unit_deviance_derivative(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - return -2*(y-mu)/self.unit_variance(mu) + return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): r"""Compute the deviance. @@ -434,13 +432,7 @@ def deviance(self, y, mu, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return np.sum(weights*self.unit_deviance(y, mu)) - - def _deviance(self, coef, X, y, weights, link): - """Compute the deviance as a function of the coefficients and data.""" - lin_pred = safe_sparse_dot(X, coef, dense_output=True) - mu = link.inverse(lin_pred) - return self.deviance(y, mu, weights) + return np.sum(weights * self.unit_deviance(y, mu)) def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. @@ -458,7 +450,36 @@ def deviance_derivative(self, y, mu, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return weights*self.unit_deviance_derivative(y, mu) + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu, the deviance and it's derivative w.r.t coef.""" + lin_pred = X @ coef + mu = link.inverse(lin_pred) + dev = self.deviance(y, mu, weights) + d1 = link.inverse_derivative(lin_pred) + devp = X.T @ (d1 * self.deviance_derivative(y, mu, weights)) + return mu, dev, devp def _score(self, coef, phi, X, y, weights, link): r"""Compute the score function. 
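The chain rule behind ``_mu_deviance_derivative`` (with mu = h(X @ coef), the derivative of the weighted deviance w.r.t. coef is X.T @ (h'(eta) * weights * unit_deviance_derivative(y, mu))) can be checked against finite differences. A self-contained sketch for the Poisson deviance with log link; the helpers ``deviance`` and ``deviance_grad`` are illustrative only, not part of this module::

    import numpy as np
    from scipy import special

    rng = np.random.RandomState(0)
    X = rng.rand(20, 3)
    y = rng.poisson(lam=2.0, size=20).astype(float)
    weights = np.ones(20)

    def deviance(coef):
        mu = np.exp(X @ coef)                 # log link: h(eta) = exp(eta)
        return np.sum(weights * 2 * (special.xlogy(y, y / mu) - y + mu))

    def deviance_grad(coef):
        eta = X @ coef
        mu = np.exp(eta)
        d1 = np.exp(eta)                      # h'(eta) for the log link
        # Poisson unit_deviance_derivative: -2 * (y - mu) / v(mu), with v(mu) = mu
        return X.T @ (d1 * weights * (-2) * (y - mu) / mu)

    coef = rng.rand(3)
    eps = 1e-6
    num = np.array([(deviance(coef + eps * e) - deviance(coef - eps * e)) / (2 * eps)
                    for e in np.eye(3)])
    assert np.allclose(num, deviance_grad(coef), rtol=1e-5, atol=1e-8)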
@@ -476,16 +497,14 @@ def _score(self, coef, phi, X, y, weights, link): with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. """ - n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) - d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=True) - score = safe_sparse_dot(X.T, temp, dense_output=True) + temp = sigma_inv * d * (y - mu) + score = X.T @ temp return score def _fisher_matrix(self, coef, phi, X, y, weights, link): @@ -508,14 +527,13 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): see func:`_score`. """ n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) - fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) + fisher_matrix = X.T @ d2_sigma_inv @ X return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): @@ -542,7 +560,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): see :func:`score_` function and :func:`_fisher_matrix`. """ n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) @@ -551,80 +569,59 @@ def _observed_information(self, coef, phi, X, y, weights, link): r = y - mu temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(temp, X, dense_output=False) - observed_information = safe_sparse_dot(X.T, temp, dense_output=False) + observed_information = X.T @ temp @ X return observed_information - def _deviance_derivative(self, coef, X, y, weights, link): - r"""Compute the derivative of the deviance w.r.t. coef. - - The derivative of the deviance w.r.t. `coef` (:math:`w`) as a - function of the coefficients `coef` and the data. - This is equivalent to :math:`-2\phi` times the score function - :func:`_score` (derivative of the log-likelihood). - """ - score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, - link=link) - return -2*score - - def _deviance_hessian(self, coef, X, y, weights, link): - r"""Compute the hessian matrix of the deviance w.r.t. coef. - - The hessian of the deviance w.r.t. `coef` (:math:`w`) is evaluated as - a function of the coefficients `coef` and the data. - It is equivalent to :math:`+2\phi` times the observed information - matrix. - """ - info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, - weights=weights, link=link) - return 2*info_matrix - - def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, + diag_fisher=False): """Compute linear predictor, mean, score function and fisher matrix. 
It calculates the linear predictor, the mean, score function (derivative of log-likelihood) and Fisher information matrix all in one go as function of `coef` (:math:`w`) and the data. + + Parameters + ---------- + diag_fisher : boolean, optional (default=False) + If ``True``, returns only an array d such that + fisher = X.T @ np.diag(d) @ X. + + Returns + ------- + (eta, mu, score, fisher) : tuple with 4 elements + The 4 elements are: + + * eta: ndarray, shape (X.shape[0],) + * mu: ndarray, shape (X.shape[0],) + * score: ndarray, shape (X.shape[0],) + * fisher: + + * If diag_fisher is ``False``, the full fisher matrix, + an array of shape (X.shape[1], X.shape[1]) + * If diag_fisher is ``True`, an array of shape (X.shape[0]) """ n_samples, n_features = X.shape # eta = linear predictor - eta = safe_sparse_dot(X, coef, dense_output=True) + eta = X @ coef mu = link.inverse(eta) sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) d1 = link.inverse_derivative(eta) # = h'(eta) # Alternatively: # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g # d1 = 1./link.derivative(mu) - d1_sigma_inv = sparse.dia_matrix((sigma_inv*d1, 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d1_sigma_inv, (y-mu), dense_output=True) - score = safe_sparse_dot(X.T, temp, dense_output=True) + score = X.T @ (sigma_inv * d1 * (y - mu)) # - d2_sigma_inv = sparse.dia_matrix((sigma_inv*(d1**2), 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) - fisher = safe_sparse_dot(X.T, temp, dense_output=False) - return eta, mu, score, fisher - - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. - ind_weight) * np.average(y, weights=weights)) + d2_sigma_inv = sigma_inv * (d1**2) + if diag_fisher: + return eta, mu, score, d2_sigma_inv + else: + if sparse.issparse(X): + d2_sigma_inv = sparse.dia_matrix((d2_sigma_inv, 0), + shape=(n_samples, n_samples)) + fisher = (X.T @ d2_sigma_inv @ X).toarray() + else: + fisher = (X.T * d2_sigma_inv) @ X + return eta, mu, score, fisher class TweedieDistribution(ExponentialDispersionModel): @@ -724,20 +721,20 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Predicted mean. 
""" - return self.power*np.power(mu, self.power-1) + return self.power * np.power(mu, self.power - 1) def unit_deviance(self, y, mu): p = self.power if p == 0: # NormalDistribution - return (y-mu)**2 + return (y - mu)**2 if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution - return 2 * (np.log(mu/y)+y/mu-1) + return 2 * (np.log(mu/y) + y/mu - 1) else: # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) @@ -784,11 +781,11 @@ def unit_variance(self, mu): return 1 + mu**2 def unit_variance_derivative(self, mu): - return 2*mu + return 2 * mu def unit_deviance(self, y, mu): - return (2*y*(np.arctan(y) - np.arctan(mu)) + - np.log((1+mu**2)/(1+y**2))) + return (2 * y * (np.arctan(y) - np.arctan(mu)) + + np.log((1 + mu**2)/(1 + y**2))) class BinomialDistribution(ExponentialDispersionModel): @@ -809,7 +806,7 @@ def unit_variance_derivative(self, mu): return 1 - 2 * mu def unit_deviance(self, y, mu): - return 2*(special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) def _irls_step(X, W, P2, z): @@ -824,20 +821,20 @@ def _irls_step(X, W, P2, z): Parameters ---------- - X : {numpy array, sparse matrix}, shape (n_samples, n_features) + X : {ndarray, sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array, shape (n_samples,) + W : ndarray, shape (n_samples,) - P2 : {numpy array, sparse matrix}, shape (n_features, n_features) + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) The L2-penalty matrix or vector (=diagonal matrix) - z : numpy array, shape (n_samples,) + z : ndarray, shape (n_samples,) Working observations Returns ------- - coef: array, shape (X.shape[1]) + coef: ndarray, shape (X.shape[1]) """ # Note: solve vs least squares, what is more appropriate? # scipy.linalg.solve seems faster, but scipy.linalg.lstsq @@ -868,27 +865,422 @@ def _irls_step(X, W, P2, z): return coef +def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + # eta = linear predictor + eta = X @ coef + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. 
+ # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z) + # updated linear predictor + # do it here for updated values for tolerance + eta = X @ coef + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + gradient = -(X.T @ (hp*(y-mu)/V)) + if P2.ndim == 1: + gradient += P2*coef + else: + gradient += P2 @ coef + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent = cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + B = fisher + if P2.ndim == 1: + coef_P2 = coef * P2 + if not diag_fisher: + B[np.diag_indices_from(B)] += P2 + else: + coef_P2 = P2 @ coef # P2 is symmetric, mat @ vec is usually faster + if not diag_fisher: + if sparse.issparse(P2): + B += P2.toarray() + else: + B += P2 + A = -score + coef_P2 # + d @ (H+P2) but d=0 so far + # inner loop + inner_iter = 0 + while inner_iter < max_inner_iter: + inner_iter += 1 + n_cycles += 1 + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + a = A[j] + if diag_fisher: + if sparse.issparse(X): + xj = X[:, j] + b = xj.transpose() @ xj.multiply(fisher[:, np.newaxis]) + b = b[0, 0] + else: + b = X[:, j] @ (fisher * X[:, j]) + + if P2.ndim == 1: + b += P2[j] + else: + b += P2[j, j] + else: + b = B[j, j] + + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[j] + d[j]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[j] + d[j]): + z = -(a - P1[j])/b + else: + z = -(coef[j] + d[j]) + + # update direction d + d[j] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + if sparse.issparse(X): + A += (X.transpose() @ + X[:, j].multiply(fisher[:, np.newaxis]) + ).toarray().ravel() * z + else: + # A += (X.T @ (fisher * X[:, j])) * z + # same without transpose of X + A += ((fisher * X[:, j]) @ X) * z + + if P2.ndim == 1: + A[j] += P2[j] * z + elif sparse.issparse(P2): + # slice columns as P2 is csc + A += P2[:, j].toarray().ravel() * z + else: + A += P2[:, j] * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, j] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[j, :] * z + # end of cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + mn_subgrad = np.where(coef + d == 0, + np.sign(A) * np.maximum(np.abs(A) - P1, 0), + A + np.sign(coef + d) * P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective beeing minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. + Note: f' = -score, H = Fisher matrix + + Parameters + ---------- + coef: ndarray, shape (n_features,) + + X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present). If not sparse, + pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. + + y : ndarray, shape (n_samples,) + Target values. + + weights: ndarray, shape (n_samples,) + Sample weights with which the deviance is weighted. The weights must + bee normalized and sum to 1. 
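The coordinate update above relies on the closed form z = S(b*d - a, c) / b - d with the soft-thresholding operator S(x, c) = sign(x) * max(|x| - c, 0). It can be checked against a brute-force grid search of the one-dimensional objective a*z + 1/2*b*z^2 + c*|d + z| (standalone sketch; ``soft_threshold`` is not a helper of this module)::

    import numpy as np

    def soft_threshold(x, c):
        return np.sign(x) * np.maximum(np.abs(x) - c, 0)

    rng = np.random.RandomState(0)
    for _ in range(20):
        a, d = rng.randn(2)
        b = 0.5 + rng.rand()       # b > 0
        c = rng.rand()             # c >= 0
        z_closed = soft_threshold(b * d - a, c) / b - d
        z_grid = np.linspace(-50, 50, 200001)
        obj = a * z_grid + 0.5 * b * z_grid ** 2 + c * np.abs(d + z_grid)
        assert abs(z_closed - z_grid[np.argmin(obj)]) < 1e-3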
+ + P1 : {ndarray}, shape (n_features,) + The L1-penalty vector (=diagonal matrix) + + P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix). If a matrix is + passed, it must be symmetric. If X is sparse, P2 must also be sparse. + + family : ExponentialDispersionModel + + link : Link + + max_iter : int, optional (default=100) + Maximum numer of outer (Newton) iterations. + + max_inner_iter : int, optional (default=1000) + Maximum number of iterations, i.e. cycles over all features, in inner + loop. + + tol : float, optional (default=1e-4) + Covergence criterion is + sum_i(|minimum of norm of subgrad of objective_i|)<=tol. + + selection : str, optional (default='cyclic') + If 'random', randomly chose features in inner loop. + + random_state : {int, RandomState instance, None}, optional (default=None) + + diag_fisher : boolean, optional (default=False) + 'False' calculates full fisher matrix, 'True' only diagonal matrix s.t. + fisher = X.T @ diag @ X. This saves storage but needs more + matrix-vector multiplications. + + copy_X : boolean, optional (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + coef : ndarray, shape (n_features,) + + n_iter : numer of outer iterations = newton iterations + + n_cycles : number of cycles over features + + References + ---------- + Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + X = check_array(X, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if P2.ndim == 2: + P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if sparse.issparse(X): + if not sparse.isspmatrix_csc(X): + raise ValueError("If X is sparse, it must be in csc format" + "; got (format={})".format(X.format)) + if not sparse.isspmatrix_csc(P2): + raise ValueError("If X is sparse, P2 must also be sparse csc" + "format. Got P2 not sparse.") + random_state = check_random_state(random_state) + # Note: we already set P2 = l2*P2, P1 = l1*P1 + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + n_iter = 0 # number of outer iterations + n_cycles = 0 # number of (complete) cycles over features + converged = False + n_samples, n_features = X.shape + # line search parameters + (beta, sigma) = (0.5, 0.01) + # some precalculations + # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a + # 1d array representing a diagonal matrix. + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # fp_wP2 = f'(w) + w*P2 + if P2.ndim == 1: + fp_wP2 = -score + coef * P2 + else: + # Note: P2 is symmetric and matrix @ vector is faster for sparse + # matrices. 
+ fp_wP2 = -score + P2 @ coef + inner_tol = np.where(coef == 0, + np.sign(fp_wP2) * np.maximum(np.abs(fp_wP2) - P1, 0), + fp_wP2 + np.sign(coef) * P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef, ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d + + linalg.norm(P1 * (coef + d), ord=1) - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(X @ coef_wd) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd, ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd * P2) @ coef_wd) + else: + Fwd += 0.5 * (coef_wd @ (P2 @ coef_wd)) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum of norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + if P2.ndim == 1: + fp_wP2 = -score + coef * P2 + else: + fp_wP2 = -score + P2 @ coef # P2 is symmetric, mat @ vec is faster + mn_subgrad = np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2 + np.sign(coef) * P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the number of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean `mu=h(X*w)`. Therefore the fit minimizes - the following objective function with combined L1 and L2 priors as - regularizer:: + fitting and predicting the mean of the target y as mu=h(X*w). Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w - with inverse link function `h` and s=`sample_weight` (for - ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). - For ``P1=P2='identity'`` (``P1=None``, ``P2=None``), the penalty is the - elastic net:: + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). 
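For the normal family with identity link, unit sample weights and ``P1=P2='identity'``, this objective is exactly the elastic net objective minimized by ``sklearn.linear_model.ElasticNet`` (compare ``test_normal_enet`` in the tests). A small numerical check; the helper ``objective`` and the toy data are illustrative only::

    import numpy as np
    from sklearn.linear_model import ElasticNet

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    y = X @ np.array([1.0, 0.0, -2.0, 0.0, 3.0]) + 0.1 * rng.randn(100)
    alpha, l1_ratio = 0.1, 0.7

    def objective(w):
        # normal deviance = sum((y - X w)^2), s_i = 1, P1 = P2 = identity
        return (0.5 * np.mean((y - X @ w) ** 2)
                + alpha * l1_ratio * np.sum(np.abs(w))
                + 0.5 * alpha * (1 - l1_ratio) * (w @ w))

    enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False,
                      tol=1e-10).fit(X, y)
    w_opt = enet.coef_
    perturbed = min(objective(w_opt + 1e-3 * rng.randn(5)) for _ in range(20))
    assert objective(w_opt) <= perturbed + 1e-8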
+ For ``P1=P2='identity'``, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - If you are interested in controlling the L1 and L2 penalty + If you are interested in controlling the L1 and L2 penalties separately, keep in mind that this is equivalent to:: a * L1 + b * L2 @@ -897,9 +1289,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): alpha = a + b and l1_ratio = a / (a + b) - The parameter `l1_ratio` corresponds to alpha in the glmnet R package while - 'alpha' corresponds to the lambda parameter in glmnet. Specifically, - l1_ratio = 1 is the lasso penalty. + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. Read more in the :ref:`User Guide `. @@ -948,8 +1340,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, - optional (default='auto') + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). Option 'auto' sets the link depending on the chosen family as follows: @@ -982,7 +1374,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'cd' Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. + penalties. Note that in order to avoid unnecessary memory + duplication of the X argument in the ``fit`` method, X should be + directly passed as a Fortran-contiguous numpy array or sparse csc + matrix. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -990,8 +1385,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where ``g_i`` is the i-th component of the gradient (derivative of - the objective function). + where g_i is the i-th component of the gradient (derivative) of + the objective function. For the cd solver, covergence is reached + when ``sum_i(|minimum of norm of g_i|)``, where g_i is the + subgradient of the objective. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1007,7 +1404,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'irls' Start values of mu are calculated by family.starting_mu(..). Then, - one step of irls obtains start values for ``coef_`. This gives + one step of irls obtains start values for ``coef_``. This gives usually good results. 'least_squares' @@ -1042,6 +1439,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. + diag_fisher : boolean, (default=False) + Only relevant for solver 'cd'. If ``False``, the full Fisher matrix + (expected Hessian) is computed in each outer iteretion (Newton + iteration). If ``True``, only a diagonal matrix (stored as 1d array) is + computed, such that fisher = X.T @ diag @ X. This saves memory and + matrix-matrix multiplications, but needs more matrix-vector + multiplications. 
If you use large sparse X or if you have many + features, i.e. n_features >> n_samples, you might set this option to + ``True``. + copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. @@ -1056,40 +1463,43 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- coef_ : array, shape (n_features,) - Estimated coefficients for the linear predictor (X*coef_) in the GLM. + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. dispersion_ : float - The dispersion parameter :math:`\\phi` if fit_dispersion is set. + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. n_iter_ : int - Actual number of iterations of the solver. + Actual number of iterations used in solver. Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - (penalized) maximum likelihood which is equivalent to minimizing the - deviance. + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. - For `alpha` > 0, the feature matrix `X` should be standardized in order to + For alpha > 0, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - If the target `y` is a ratio, appropriate sample weights `s` should be + If the target y is a ratio, appropriate sample weights s should be provided. - As an example, consider Poission distributed counts `z` (integers) and - weights `s=exposure` (time, money, persons years, ...). Then you fit - `y = z/s`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + As an example, consider Poission distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that `y` has a 'scaled' Poisson distributions. + in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. 
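The weighted mean mentioned here is easy to verify numerically: with rates y = z/s and weights s, the weighted average equals total counts over total exposure, the natural finite-sample estimate of the underlying rate (toy data, assumed true rate 1.7)::

    import numpy as np

    rng = np.random.RandomState(0)
    exposure = rng.uniform(0.5, 3.0, size=1000)   # s = exposure
    z = rng.poisson(lam=1.7 * exposure)           # observed counts
    y = z / exposure                              # rates to be modelled

    assert np.isclose(np.average(y, weights=exposure), z.sum() / exposure.sum())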
References @@ -1104,8 +1514,8 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', - selection='cyclic', random_state=None, copy_X=True, - check_input=True, verbose=0): + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio self.P1 = P1 @@ -1121,6 +1531,7 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.start_params = start_params self.selection = selection self.random_state = random_state + self.diag_fisher = diag_fisher self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -1154,11 +1565,13 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1.1 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - dtype=_dtype, y_numeric=True, multi_output=False) + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) # Without converting y to float, deviance might raise # ValueError: Integers to negative integer powers are not allowed. - y = y.astype(np.float64) + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) weights = _check_weights(sample_weight, y.shape[0]) @@ -1263,7 +1676,7 @@ def fit(self, X, y, sample_weight=None): " correct length;" " got(start_params={0})".format(start_params)) else: - start_params = check_array(start_params, accept_sparse='csr', + start_params = check_array(start_params, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype=_dtype, copy=True) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or @@ -1274,12 +1687,14 @@ def fit(self, X, y, sample_weight=None): .format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) - if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " "'random'; got (selection={0})" .format(self.selection)) random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -1303,26 +1718,34 @@ def fit(self, X, y, sample_weight=None): "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." .format(P1.shape[0], X.shape[1])) + # If X is sparse, make P2 sparse, too. 
if isinstance(self.P2, str) and self.P2 == 'identity': - if not sparse.issparse(X): - P2 = np.ones(X.shape[1]) - else: + if sparse.issparse(X): P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() + else: + P2 = np.ones(X.shape[1]) else: P2 = check_array(self.P2, copy=True, - accept_sparse=['csr', 'csc', 'coo'], + accept_sparse=['csr', 'csc'], dtype=_dtype, ensure_2d=False) if P2.ndim == 1: + P2 = np.asarray(P2) if P2.shape[0] != X.shape[1]: raise ValueError("P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " "got (P2.shape=({0},)), needed ({1},)" .format(P2.shape[0], X.shape[1])) - elif ((P2.ndim != 2) or - (P2.shape[0] != P2.shape[1]) or - (P2.shape[0] != X.shape[1])): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " @@ -1335,7 +1758,8 @@ def fit(self, X, y, sample_weight=None): if self.fit_intercept: # Note: intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) + Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X], + format=X.format) else: Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) P1 = np.concatenate((np.array([0]), P1)) @@ -1343,6 +1767,7 @@ def fit(self, X, y, sample_weight=None): P2 = np.concatenate((np.array([0]), P2)) elif sparse.issparse(P2): P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), + format=P2.format, dtype=P2.dtype).tocsr() else: # as of numpy 1.13 this would work: @@ -1355,7 +1780,7 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = Xnew.shape l1 = self.alpha * self.l1_ratio - l2 = self.alpha * (1-self.l1_ratio) + l2 = self.alpha * (1 - self.l1_ratio) P1 *= l1 P2 *= l2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') @@ -1484,315 +1909,114 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # algorithms for optimiation # TODO: Parallelize it? - self.n_iter_ = 0 - converged = False + # 4.1 IRLS ############################################################ - # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' - # Obj = objective function = 1/2 Dev + l2/2 w P2 w - # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 - # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) - # D2 = link.inverse_derivative(eta)^2 = D^2 - # W = D2/V(mu) - # l2 = alpha * (1 - l1_ratio) - # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w - # = -X' D (y-mu)/V(mu) + l2 P2 w - # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 - # Use Fisher matrix instead of full info matrix -X'(...) X, - # i.e. 
E[Dev''] with E[y-mu]=0: - # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, - # with z = eta + D^-1 (y-mu) # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') - # Note: ' denotes derivative, but also transpose for matrices if solver == 'irls': - # eta = linear predictor - eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = link.inverse(eta) - # D = h'(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - while self.n_iter_ < self.max_iter: - self.n_iter_ += 1 - # coef_old not used so far. - # coef_old = coef - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / V) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + P2, b = X' W z - coef = _irls_step(Xnew, W, P2, z) - # updated linear predictor - # do it here for updated values for tolerance - eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = link.inverse(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - # which tolerace? |coef - coef_old| or gradient? - # use gradient for compliance with newton-cg and lbfgs - # gradient = family._deviance_derivative( - # coef=coef, X=Xnew, y=y, weights=weights, link=link) - # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - gradient = -safe_sparse_dot(Xnew.T, hp*(y-mu)/V) + coef, self.n_iter_ = \ + _irls_solver(coef=coef, X=Xnew, y=y, weights=weights, P2=P2, + family=family, link=link, max_iter=self.max_iter, + tol=self.tol) + + # 4.2 L-BFGS ########################################################## + elif solver == 'lbfgs': + def func(coef, X, y, weights, P2, family, link): + mu, dev, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) if P2.ndim == 1: - gradient += P2*coef + L2 = P2 * coef else: - gradient += safe_sparse_dot(P2, coef) - if (np.max(np.abs(gradient)) <= self.tol): - converged = True - break - - if not converged: - warnings.warn("irls failed to converge. Increase the number " - "of iterations (currently {0})" - .format(self.max_iter), ConvergenceWarning) - - # 4.2 L-BFGS and Newton-CG ############################################ - # TODO: performance: make one function return both deviance and - # gradient of deviance - elif solver in ['lbfgs', 'newton-cg']: - def func(coef, *args): + L2 = P2 @ coef + obj = 0.5 * dev + 0.5 * (coef @ L2) + objp = 0.5 * devp + L2 + return obj, objp + + args = (Xnew, y, weights, P2, family, link) + coef, loss, info = fmin_l_bfgs_b( + func, coef, fprime=None, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + + # 4.3 Newton-CG ####################################################### + # We use again the fisher matrix instead of the hessian. More + # precisely, expected hessian of deviance. 
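Both the lbfgs branch above and the newton-cg branch below minimize 0.5 * deviance + 0.5 * w @ P2 @ w. The following standalone sketch mirrors the lbfgs ``func`` (objective and gradient returned together) for a toy Poisson problem with log link; the data and the penalty are assumptions, not the estimator's code::

    import numpy as np
    from scipy import special
    from scipy.optimize import fmin_l_bfgs_b

    rng = np.random.RandomState(0)
    X = rng.rand(60, 3)
    y = rng.poisson(lam=np.exp(X @ np.array([0.2, -0.3, 0.5]))).astype(float)
    P2 = 0.5 * np.ones(3)                 # diagonal L2 penalty, already scaled by l2

    def func(coef, X, y, P2):
        # 0.5 * Poisson deviance (log link) + 0.5 * coef @ (P2 * coef), and gradient
        mu = np.exp(X @ coef)
        dev = 2 * np.sum(special.xlogy(y, y / mu) - y + mu)
        devp = -2 * (X.T @ (y - mu))      # chain rule for the log link
        L2 = P2 * coef
        return 0.5 * dev + 0.5 * (coef @ L2), 0.5 * devp + L2

    coef, obj, info = fmin_l_bfgs_b(func, np.zeros(3), args=(X, y, P2),
                                    pgtol=1e-10, factr=10)
    assert np.max(np.abs(func(coef, X, y, P2)[1])) < 1e-4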
+ elif solver == 'newton-cg': + def func(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = safe_sparse_dot(coef.T, P2*coef) + L2 = coef @ (P2 * coef) else: - L2 = safe_sparse_dot(coef.T, safe_sparse_dot(P2, coef)) - # A[np.diag_indices_from(A)] += P2 - return 0.5*family._deviance(coef, *args) + 0.5*L2 + L2 = coef @ (P2 @ coef) + mu = link.inverse(X @ coef) + return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 - def fprime(coef, *args): + def grad(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = P2*coef + L2 = P2 * coef else: - L2 = safe_sparse_dot(P2, coef) - return 0.5*family._deviance_derivative(coef, *args) + L2 + L2 = P2 @ coef + eta = X @ coef + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + grad = X.T @ (d1 * family.deviance_derivative(y, mu, weights)) + return 0.5 * grad + L2 - def grad_hess(coef, X, y, weights, link): + def grad_hess(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = P2*coef + L2 = P2 * coef else: - L2 = safe_sparse_dot(P2, coef) - grad = 0.5*family._deviance_derivative( - coef, X, y, weights, link) + L2 - hessian = 0.5*family._deviance_hessian( - coef, X, y, weights, link) - if P2.ndim == 1: - hessian[np.diag_indices_from(hessian)] += P2 - else: - hessian = hessian + P2 + L2 = P2 @ coef + eta = X @ coef + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + grad = 0.5 * \ + (X.T @ (d1 * family.deviance_derivative(y, mu, weights))) \ + + L2 + # expected hessian = X.T @ diag_matrix @ X + # calculate only diag_matrix + diag = d1**2 / family.variance(mu, phi=1, weights=weights) def Hs(s): - ret = safe_sparse_dot(hessian, s) + ret = 0.5 * (X.T @ (diag * (X @ s))) + if P2.ndim == 1: + ret += P2 * s + else: + ret += P2 @ s return ret + return grad, Hs - args = (Xnew, y, weights, link) - - if solver == 'lbfgs': - coef, loss, info = fmin_l_bfgs_b( - func, coef, fprime=fprime, args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) - self.n_iter_ = info['nit'] - elif solver == 'newton-cg': - coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) - - # 4.3 coordinate descent ############################################## - # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - # An Improved GLMNET for L1-regularized Logistic Regression, - # Journal of Machine Learning Research 13 (2012) 1999-2030 - # Note: Use Fisher matrix instead of Hessian for H - # - # 1. find optimal descent direction d by minimizing - # min_d F(w+d) = min_d F(w+d) - F(w) - # F = f + g, f(w) = 1/2 deviance, g(w) = 1/2 w*P2*w + ||P1*w||_1 - # 2. quadrdatic approximation of F(w+d)-F(w) = q(d): - # using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives - # q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d - # + ||P1*(w+d)||_1 - ||P1*w||_1 - # min_d q(d) - # 3. 
coordinate descent by updating coordinate j (d -> d+z*e_j): - # min_z q(d+z*e_j) - # = min_z q(d+z*e_j) - q(d) - # = min_z A_j z + 1/2 B_jj z^2 - # + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 - # A = f'(w) + d*H(w) + (w+d)*P2 - # B = H+P2 - # Note: we already set P2 = l2*P2, P1 = l1*P1, see above + args = (Xnew, y, weights, P2, family, link) + coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + # 4.4 coordinate descent ############################################## + # Note: we already set P1 = l1*P1, see above + # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') - # Note: f' = -score, H = Fisher matrix elif solver == 'cd': - # line search parameters - (beta, sigma) = (0.5, 0.01) - # max inner loops (cycles through all features) - max_inner_iter = 1000 - # some precalculations - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) - # set up space for search direction d for inner loop - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum-norm of subgradient of F - # fp_wP2 = f'(w) + w*P2 - if P2.ndim == 1: - fp_wP2 = -score + coef*P2 - else: - fp_wP2 = -score + safe_sparse_dot(coef, P2) - inner_tol = (np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2+np.sign(coef)*P1)) - inner_tol = linalg.norm(inner_tol, ord=1) - # outer loop - while self.n_iter_ < self.max_iter: - self.n_iter_ += 1 - # initialize search direction d (to be optimized) with zero - d.fill(0) - # inner loop - # TODO: use sparsity (coefficient already 0 due to L1 penalty) - # => active set of features for featurelist, see paper - # of Improved GLMNET or Gap Safe Screening Rules - # https://arxiv.org/abs/1611.05780 - # A = f'(w) + d*H(w) + (w+d)*P2 - # B = H+P2 - # Note: f'=-score and H=fisher are updated at the end of outer - # iteration - B = fisher - if P2.ndim == 1: - coef_P2 = coef * P2 - B[np.diag_indices_from(B)] += P2 - else: - coef_P2 = safe_sparse_dot(coef, P2) - B = B + P2 - A = -score + coef_P2 # + d*(H+P2) but d=0 so far - inner_iter = 0 - while inner_iter < max_inner_iter: - inner_iter += 1 - if self.selection == 'random': - featurelist = random_state.permutation(n_features) - else: - featurelist = np.arange(n_features) - for j in featurelist: - # minimize_z: a z + 1/2 b z^2 + c |d+z| - # a = A_j - # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, see 1.3 - # d = w_j + d_j - # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) - # with beta = z+d, beta_hat = d-a/b and gamma = c/b - # z = 1/b * S(bd-a,c) - d - # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - a = A[j] - b = B[j, j] - if P1[j] == 0: - if b == 0: - z = 0 - else: - z = -a/b - elif a + P1[j] < b * (coef[j]+d[j]): - if b == 0: - z = 0 - else: - z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[j]+d[j]): - if b == 0: - z = 0 - else: - z = -(a - P1[j])/b - else: - z = -(coef[j] + d[j]) - # update direction d - d[j] += z - # update A because d_j is now d_j+z - # A = f'(w) + d*H(w) + (w+d)*P2 - # => A += (H+P2)*e_j z = B_j * z - # Note: B is symmetric B = B.transpose - if sparse.issparse(B): - if sparse.isspmatrix_csc(B): - # slice columns - A += B[:, j].toarray().ravel() * z - else: - # slice rows - A += B[j, :].toarray().ravel() * z - else: - A += B[j, :] * z - # end of cycle - # stopping criterion for inner loop - # sum_i(|minimum-norm subgrad of q(d)_i|) - mn_subgrad = (np.where(coef + d == 0, - np.sign(A)*np.maximum(np.abs(A)-P1, 0), - A+np.sign(coef+d)*P1)) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= inner_tol: - if inner_iter == 1: - inner_tol = inner_tol/4. - break - # end of inner loop - # line search by sequence beta^k, k=0, 1, .. - # F(w + lambda d) - F(w) <= lambda * bound - # bound = sigma * (f'(w)*d + w*P2*d - # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1*coef, ord=1) - # Note: coef_P2 already calculated and still valid - bound = sigma * ( - safe_sparse_dot(-score, d) + - safe_sparse_dot(coef_P2, d) + - linalg.norm(P1*(coef+d), ord=1) - - P1w_1) - Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * safe_sparse_dot(coef_P2, coef) + - P1w_1) - la = 1./beta - for k in range(20): - la *= beta # starts with la=1 - mu_wd = link.inverse(safe_sparse_dot(Xnew, coef+la*d, - dense_output=True)) - Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1*(coef+la*d), ord=1)) - if P2.ndim == 1: - Fwd += 0.5 * safe_sparse_dot((coef+la*d)*P2, coef+la*d) - else: - Fwd += 0.5 * (safe_sparse_dot(coef+la*d, - safe_sparse_dot(P2, coef+la*d))) - if Fwd-Fw <= sigma*la*bound: - break - # update coefficients - # coef_old = coef.copy() - coef += la * d - # calculate eta, mu, score, Fisher matrix for next iteration - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) - # stopping criterion for outer loop - # sum_i(|minimum-norm subgrad of F(w)_i|) - # fp_wP2 = f'(w) + w*P2 - # Note: eta, mu and score are already updated - if P2.ndim == 1: - fp_wP2 = -score + coef*P2 - else: - fp_wP2 = -score + safe_sparse_dot(coef, P2) - mn_subgrad = (np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2+np.sign(coef)*P1)) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= self.tol: - converged = True - break - # end of outer loop - if not converged: - warnings.warn("Coordinate descent failed to converge. 
Increase" - " the number of iterations (currently {0})" - .format(self.max_iter), ConvergenceWarning) + # For coordinate descent, if X is sparse, it should be csc format + # If X is sparse, P2 must also be csc + if sparse.issparse(Xnew): + Xnew = Xnew.tocsc(copy=self.copy_X) + P2 = sparse.csc_matrix(P2) + + coef, self.n_iter_, self._n_cycles = \ + _cd_solver(coef=coef, X=Xnew, y=y, weights=weights, P1=P1, + P2=P2, family=family, link=link, + max_iter=self.max_iter, tol=self.tol, + selection=self.selection, random_state=random_state, + diag_fisher=self.diag_fisher, copy_X=self.copy_X) ####################################################################### # 5. postprocessing # @@ -1828,8 +2052,7 @@ def linear_predictor(self, X): X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) - return safe_sparse_dot(X, self.coef_, - dense_output=True) + self.intercept_ + return X @ self.coef_ + self.intercept_ def predict(self, X, sample_weight=None): """Predict uing GLM with feature matrix X. @@ -1884,7 +2107,7 @@ def estimate_phi(self, X, y, sample_weight=None): dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape weights = _check_weights(sample_weight, n_samples) - eta = safe_sparse_dot(X, self.coef_, dense_output=True) + eta = X @ self.coef_ if self.fit_intercept is True: eta += self.intercept_ n_features += 1 @@ -1907,19 +2130,19 @@ def estimate_phi(self, X, y, sample_weight=None): # "AssertionError: -0.28014056555724598 not greater than 0.5" # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): - r"""Compute D^2, the percentage of deviance explained. + """Compute D^2, the percentage of deviance explained. D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal for family='normal'. D^2 is defined as - :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` - is the null deviance, i.e. the deviance of a model with intercept - alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is averaged by sample_weight. - Best possible score is 1.0 and it can be negative (because the - model can be arbitrarily worse). + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). 
Parameters ---------- diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de0857a34fe3a..ae8a5f4cfc5e7 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -138,16 +138,18 @@ def test_sample_weights_validation(): glm.fit(X, y, weights) -def test_glm_family_argument(): +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" - y = np.array([1, 2]) + y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - for (f, fam) in [('normal', NormalDistribution()), - ('poisson', PoissonDistribution()), - ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution())]: - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert_equal(type(glm._family_instance), type(fam)) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -155,14 +157,16 @@ def test_glm_family_argument(): glm.fit(X, y) -def test_glm_link_argument(): +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): """Test GLM link argument set as string.""" - y = np.array([1, 2]) + y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - for (l, link) in [('identity', IdentityLink()), - ('log', LogLink())]: - glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert_equal(type(glm._link_instance), type(link)) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError): @@ -317,6 +321,16 @@ def test_glm_random_state_argument(random_state): glm.fit(X, y) +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError): + glm.fit(X, y) + + @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -453,7 +467,12 @@ def test_normal_ridge(solver): assert_array_almost_equal(glm.predict(T), ridge.predict(T)) -def test_poisson_ridge(): +@pytest.mark.parametrize('solver, decimal, tol', + [('irls', 7, 1e-8), + ('lbfgs', 5, 1e-7), + ('newton-cg', 5, 1e-7), + ('cd', 7, 1e-8)]) +def test_poisson_ridge(solver, decimal, tol): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" @@ -470,22 +489,20 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} - s_tol = {'irls': 1e-8, 'lbfgs': 1e-7, 'newton-cg': 1e-7, 'cd': 1e-8} - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, - fit_intercept=True, family='poisson', - link='log', tol=s_tol[solver], - solver=solver, max_iter=300, - random_state=42) - glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=s_dec[solver]) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=s_dec[solver]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=tol, + solver=solver, max_iter=300, + random_state=42) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=decimal) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=decimal) -def test_normal_enet(): +@pytest.mark.parametrize('diag_fisher', [False, True]) +def test_normal_enet(diag_fisher): """Test elastic net regression with normal/gaussian family.""" rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 @@ -494,12 +511,14 @@ def test_normal_enet(): beta = rng.randn(n_features) y = 2 + np.dot(X, beta) + rng.randn(n_samples) + # 1. test normal enet on dense data glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, family='normal', link='identity', fit_intercept=True, tol=1e-8, max_iter=100, selection='cyclic', solver='cd', start_params='zero', - check_input=False) + check_input=False, + diag_fisher=diag_fisher) glm.fit(X, y) enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, @@ -509,6 +528,12 @@ def test_normal_enet(): assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + # 2. test normal enet on sparse data + X = sparse.csc_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + def test_poisson_enet(): """Test elastic net regression with poisson family and LogLink. From a6f9f13db9dd11d01d1dcab93819e67d5ca18b9d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 20 Apr 2019 15:59:39 +0200 Subject: [PATCH 050/209] Treat the intercept separately, i.e. X, P1, P2 never include intercept --- sklearn/linear_model/glm.py | 637 +++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 6 +- 2 files changed, 386 insertions(+), 257 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b2de866a4b69d..2afd1ddf8c79c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -7,8 +7,6 @@ # License: BSD 3 clause # TODO: Write more examples. -# TODO: Make option self.copy_X more meaningful. -# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -39,6 +37,7 @@ # sklearn.linear_models uses w for coefficients, standard literature on # GLMs use beta for coefficients and w for (sample) weights. # So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. 
coef[0] from __future__ import division @@ -86,6 +85,83 @@ def _check_weights(sample_weight, n_samples): return weights +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose().multiply(d) @ X).toarray() + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. + + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + class Link(metaclass=ABCMeta): """Abstract base class for Link funtions.""" @@ -473,13 +549,16 @@ def starting_mu(self, y, weights=1, ind_weight=0.5): (1. - ind_weight) * np.average(y, weights=weights)) def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu, the deviance and it's derivative w.r.t coef.""" - lin_pred = X @ coef + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) - dev = self.deviance(y, mu, weights) d1 = link.inverse_derivative(lin_pred) - devp = X.T @ (d1 * self.deviance_derivative(y, mu, weights)) - return mu, dev, devp + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp def _score(self, coef, phi, X, y, weights, link): r"""Compute the score function. @@ -499,12 +578,15 @@ def _score(self, coef, phi, X, y, weights, link): :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. Note: The derivative of the deviance w.r.t. coef equals -2 * score. 
""" - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) temp = sigma_inv * d * (y - mu) - score = X.T @ temp + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp return score def _fisher_matrix(self, coef, phi, X, y, weights, link): @@ -526,14 +608,14 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, see func:`_score`. """ - n_samples = X.shape[0] - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d2 = link.inverse_derivative(lin_pred)**2 - d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), - shape=(n_samples, n_samples)) - fisher_matrix = X.T @ d2_sigma_inv @ X + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): @@ -559,17 +641,17 @@ def _observed_information(self, coef, phi, X, y, weights, link): \right)`, see :func:`score_` function and :func:`_fisher_matrix`. """ - n_samples = X.shape[0] - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) d2 = link.inverse_derivative(lin_pred)**2 v = self.unit_variance_derivative(mu)/self.unit_variance(mu) r = y - mu - temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), - shape=(n_samples, n_samples)) - observed_information = X.T @ temp @ X + temp = sigma_inv * (-dp * r + d2 * v * r + d2) + intercept = (coef.size == X.shape[1] + 1) + observed_information = _safe_sandwich_dot(X, temp, + intercept=intercept) return observed_information def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, @@ -600,28 +682,29 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, an array of shape (X.shape[1], X.shape[1]) * If diag_fisher is ``True`, an array of shape (X.shape[0]) """ - n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) # eta = linear predictor - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) d1 = link.inverse_derivative(eta) # = h'(eta) # Alternatively: # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g # d1 = 1./link.derivative(mu) - score = X.T @ (sigma_inv * d1 * (y - mu)) - # - d2_sigma_inv = sigma_inv * (d1**2) + d1_sigma_inv = d1 * sigma_inv + temp = d1_sigma_inv * (y - mu) + if intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X + + d2_sigma_inv = d1 * d1_sigma_inv if diag_fisher: - return eta, mu, score, d2_sigma_inv + fisher_matrix = d2_sigma_inv else: - if sparse.issparse(X): - d2_sigma_inv = sparse.dia_matrix((d2_sigma_inv, 0), - shape=(n_samples, n_samples)) - fisher = (X.T @ d2_sigma_inv @ X).toarray() - else: - fisher = (X.T * d2_sigma_inv) @ X - return eta, mu, score, fisher + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return eta, mu, score, fisher_matrix class TweedieDistribution(ExponentialDispersionModel): @@ -809,7 +892,7 @@ def unit_deviance(self, y, mu): return 2 * (special.xlogy(y, y/mu) 
+ special.xlogy(1-y, (1-y)/(1-mu))) -def _irls_step(X, W, P2, z): +def _irls_step(X, W, P2, z, fit_intercept=True): """Compute one step in iteratively reweighted least squares. Solve A w = b for w with @@ -829,43 +912,57 @@ def _irls_step(X, W, P2, z): P2 : {ndarray, sparse matrix}, shape (n_features, n_features) The L2-penalty matrix or vector (=diagonal matrix) - z : ndarray, shape (n_samples,) + z : ndarray, shape (n_samples,) Working observations + fit_intercept : boolean, optional (default=True) + Returns ------- - coef: ndarray, shape (X.shape[1]) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. """ # Note: solve vs least squares, what is more appropriate? # scipy.linalg.solve seems faster, but scipy.linalg.lstsq # is more robust. - n_samples, n_features = X.shape - if sparse.issparse(X): - W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) if P2.ndim == 1: - L2 = (sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) - ).tocsr() + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() else: - L2 = sparse.csr_matrix(P2) - XtW = X.transpose() * W - A = XtW * X + L2 - b = XtW * z - # coef = splinalg.spsolve(A, b) - coef, *_ = splinalg.lsmr(A, b) + A[1:, 1:] += P2 else: - XtW = (X.T * W) - A = XtW.dot(X) + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + A = (XtW @ X).toarray() + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z if P2.ndim == 1: A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() else: A += P2 - b = XtW.dot(z) - # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) - coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) return coef -def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): """Solve GLM with L2 penalty by IRLS algorithm. Note: If X is sparse, P2 must also be sparse. @@ -889,7 +986,7 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): # Note: ' denotes derivative, but also transpose for matrices # eta = linear predictor - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) # D = h'(eta) hp = link.inverse_derivative(eta) @@ -906,10 +1003,10 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): z = eta + (y - mu) / hp # solve A*coef = b # A = X' W X + P2, b = X' W z - coef = _irls_step(X, W, P2, z) + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) # updated linear predictor # do it here for updated values for tolerance - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) hp = link.inverse_derivative(eta) V = family.variance(mu, phi=1, weights=weights) @@ -917,11 +1014,18 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): # which tolerace? |coef - coef_old| or gradient? 
# use gradient for compliance with newton-cg and lbfgs # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - gradient = -(X.T @ (hp*(y-mu)/V)) + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - gradient += P2*coef + gradient += P2 * coef[idx:] else: - gradient += P2 @ coef + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) if (np.max(np.abs(gradient)) <= tol): converged = True break @@ -937,7 +1041,7 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, max_inner_iter=1000, selection='cyclic', random_state=None, diag_fisher=False): - """Compute inner loop of coordinate descent = cycles through features. + """Compute inner loop of coordinate descent, i.e. cycles through features. Minimization of 1-d subproblems:: @@ -953,24 +1057,31 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, # of Improved GLMNET or Gap Safe Screening Rules # https://arxiv.org/abs/1611.05780 n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept B = fisher if P2.ndim == 1: - coef_P2 = coef * P2 + coef_P2 = coef[idx:] * P2 if not diag_fisher: - B[np.diag_indices_from(B)] += P2 + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 else: - coef_P2 = P2 @ coef # P2 is symmetric, mat @ vec is usually faster + coef_P2 = coef[idx:] @ P2 if not diag_fisher: if sparse.issparse(P2): - B += P2.toarray() + B[idx:, idx:] += P2.toarray() else: - B += P2 - A = -score + coef_P2 # + d @ (H+P2) but d=0 so far + B[idx:, idx:] += P2 + # A = -score + coef_P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 # inner loop - inner_iter = 0 - while inner_iter < max_inner_iter: + for inner_iter in range(1, max_inner_iter+1): inner_iter += 1 n_cycles += 1 + # cycle through features, update intercept separately at the end if selection == 'random': featurelist = random_state.permutation(n_features) else: @@ -985,70 +1096,85 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, # with beta = z+d, beta_hat = d-a/b and gamma = c/b # z = 1/b * S(bd-a,c) - d # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - a = A[j] + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() if sparse.issparse(X): - xj = X[:, j] - b = xj.transpose() @ xj.multiply(fisher[:, np.newaxis]) - b = b[0, 0] + Bj[idx:] = (X[:, j].transpose().multiply(fisher) @ X + ).toarray().ravel() else: - b = X[:, j] @ (fisher * X[:, j]) + Bj[idx:] = (fisher * X[:, j]) @ X if P2.ndim == 1: - b += P2[j] + Bj[idx:] += P2[j] else: - b += P2[j, j] + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] else: - b = B[j, j] + b = B[jdx, jdx] + # those ten lines aree what it is all about if b <= 0: z = 0 elif P1[j] == 0: z = -a/b - elif a + P1[j] < b * (coef[j] + d[j]): + elif a + P1[j] < b * (coef[jdx] + d[jdx]): z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[j] + d[j]): + elif a - P1[j] > b * 
(coef[jdx] + d[jdx]): z = -(a - P1[j])/b else: - z = -(coef[j] + d[j]) + z = -(coef[jdx] + d[jdx]) # update direction d - d[j] += z + d[jdx] += z # update A because d_j is now d_j+z # A = f'(w) + d*H(w) + (w+d)*P2 # => A += (H+P2)*e_j z = B_j * z # Note: B is symmetric B = B.transpose if diag_fisher: - if sparse.issparse(X): - A += (X.transpose() @ - X[:, j].multiply(fisher[:, np.newaxis]) - ).toarray().ravel() * z - else: - # A += (X.T @ (fisher * X[:, j])) * z - # same without transpose of X - A += ((fisher * X[:, j]) @ X) * z - - if P2.ndim == 1: - A[j] += P2[j] * z - elif sparse.issparse(P2): - # slice columns as P2 is csc - A += P2[:, j].toarray().ravel() * z - else: - A += P2[:, j] * z + # Bj = B[:, j] calculated above, still valid + A += Bj * z else: # B is symmetric, C- or F-contiguous, but never sparse if B.flags['F_CONTIGUOUS']: # slice columns like for sparse csc - A += B[:, j] * z + A += B[:, jdx] * z else: # B.flags['C_CONTIGUOUS'] might be true # slice rows - A += B[j, :] * z - # end of cycle + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle # stopping criterion for inner loop # sum_i(|minimum of norm of subgrad of q(d)_i|) - mn_subgrad = np.where(coef + d == 0, - np.sign(A) * np.maximum(np.abs(A) - P1, 0), - A + np.sign(coef + d) * P1) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= inner_tol: if inner_iter == 1: @@ -1058,7 +1184,7 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, return d, coef_P2, n_cycles, inner_tol -def _cd_solver(coef, X, y, weights, P1, P2, family, link, +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, max_iter=100, max_inner_iter=1000, tol=1e-4, selection='cyclic ', random_state=None, diag_fisher=False, copy_X=True): @@ -1083,7 +1209,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 A = f'(w) + d*H(w) + (w+d)*P2 - B = H+P2 + B = H + P2 Repeat steps 1-3 until convergence. Note: Use Fisher matrix instead of Hessian for H. @@ -1091,7 +1217,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Parameters ---------- - coef: ndarray, shape (n_features,) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present). If not sparse, @@ -1112,6 +1240,10 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, The L2-penalty matrix or vector (=diagonal matrix). If a matrix is passed, it must be symmetric. If X is sparse, P2 must also be sparse. + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + family : ExponentialDispersionModel link : Link @@ -1120,8 +1252,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Maximum numer of outer (Newton) iterations. max_inner_iter : int, optional (default=1000) - Maximum number of iterations, i.e. 
cycles over all features, in inner - loop. + Maximum number of iterations in each inner loop, i.e. max number of + cycles over all features per inner loop. tol : float, optional (default=1e-4) Covergence criterion is @@ -1133,8 +1265,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, random_state : {int, RandomState instance, None}, optional (default=None) diag_fisher : boolean, optional (default=False) - 'False' calculates full fisher matrix, 'True' only diagonal matrix s.t. - fisher = X.T @ diag @ X. This saves storage but needs more + ``False`` calculates full fisher matrix, ``True`` only diagonal matrix + s.t. fisher = X.T @ diag @ X. This saves storage but needs more matrix-vector multiplications. copy_X : boolean, optional (default=True) @@ -1142,7 +1274,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Returns ------- - coef : ndarray, shape (n_features,) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. n_iter : numer of outer iterations = newton iterations @@ -1174,6 +1308,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, n_cycles = 0 # number of (complete) cycles over features converged = False n_samples, n_features = X.shape + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept # line search parameters (beta, sigma) = (0.5, 0.01) # some precalculations @@ -1186,16 +1321,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, d = np.zeros_like(coef) # initial stopping tolerance of inner loop # use L1-norm of minimum of norm of subgradient of F - # fp_wP2 = f'(w) + w*P2 - if P2.ndim == 1: - fp_wP2 = -score + coef * P2 - else: - # Note: P2 is symmetric and matrix @ vector is faster for sparse - # matrices. - fp_wP2 = -score + P2 @ coef - inner_tol = np.where(coef == 0, - np.sign(fp_wP2) * np.maximum(np.abs(fp_wP2) - P1, 0), - fp_wP2 + np.sign(coef) * P1) + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) inner_tol = linalg.norm(inner_tol, ord=1) # outer loop while n_iter < max_iter: @@ -1211,23 +1337,23 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, # F(w + lambda d) - F(w) <= lambda * bound # bound = sigma * (f'(w)*d + w*P2*d # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1 * coef, ord=1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) # Note: coef_P2 already calculated and still valid - bound = sigma * (-(score @ d) + coef_P2 @ d + - linalg.norm(P1 * (coef + d), ord=1) - P1w_1) + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * (coef_P2 @ coef) + P1w_1) + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) la = 1./beta for k in range(20): la *= beta # starts with la=1 coef_wd = coef + la * d - mu_wd = link.inverse(X @ coef_wd) + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1 * coef_wd, ord=1)) + linalg.norm(P1 * coef_wd[idx:], ord=1)) if P2.ndim == 1: - Fwd += 0.5 * ((coef_wd * P2) @ coef_wd) + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) else: - Fwd += 0.5 * (coef_wd @ (P2 @ coef_wd)) + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) if Fwd - Fw <= sigma * la * bound: break # update coefficients @@ -1238,16 +1364,10 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher) # stopping criterion for outer loop - # sum_i(|minimum of norm of 
subgrad of F(w)_i|) + # sum_i(|minimum-norm of subgrad of F(w)_i|) # fp_wP2 = f'(w) + w*P2 # Note: eta, mu and score are already updated - if P2.ndim == 1: - fp_wP2 = -score + coef * P2 - else: - fp_wP2 = -score + P2 @ coef # P2 is symmetric, mat @ vec is faster - mn_subgrad = np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2 + np.sign(coef) * P1) + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= tol: converged = True @@ -1255,8 +1375,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, # end of outer loop if not converged: warnings.warn("Coordinate descent failed to converge. Increase" - " the number of iterations (currently {0})" - .format(max_iter), ConvergenceWarning) + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) return coef, n_iter, n_cycles @@ -1387,8 +1507,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where g_i is the i-th component of the gradient (derivative) of the objective function. For the cd solver, covergence is reached - when ``sum_i(|minimum of norm of g_i|)``, where g_i is the - subgradient of the objective. + when ``sum_i(|minimum-norm of g_i|)``, where g_i is the + subgradient of the objective and minimum-norm of g_i is the element of + the subgradient g_i with the smallest L2-norm. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1563,19 +1684,7 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1. input validation # ####################################################################### - # 1.1 validate arguments of fit ####################################### - _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], - dtype=_dtype, y_numeric=True, multi_output=False, - copy=self.copy_X) - # Without converting y to float, deviance might raise - # ValueError: Integers to negative integer powers are not allowed. - # Also, y must not be sparse. - y = np.asarray(y, dtype=np.float64) - - weights = _check_weights(sample_weight, y.shape[0]) - - # 1.2 validate arguments of __init__ ################################## + # 1.1 validate arguments of __init__ ################################## # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): @@ -1668,25 +1777,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - start_params = self.start_params - if isinstance(start_params, str): - if start_params not in ['irls', 'least_squares', 'zero']: - raise ValueError("The argument start_params must be 'irls', " - "'least-squares', 'zero' or an array of " - " correct length;" - " got(start_params={0})".format(start_params)) - else: - start_params = check_array(start_params, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype, copy=True) - if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or - (start_params.ndim != 1)): - raise ValueError("Start values for parameters must have the" - "right length and dimension; required (length" - "={0}, ndim=1); got (length={1}, ndim={2})." 
- .format(X.shape[1] + self.fit_intercept, - start_params.shape[0], - start_params.ndim)) if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " "'random'; got (selection={0})" @@ -1702,36 +1792,59 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params if isinstance(self.P1, str) and self.P1 == 'identity': - P1 = np.ones(X.shape[1]) + P1 = np.ones(n_features) else: P1 = np.atleast_1d(self.P1) try: - P1 = P1.astype(np.float64, casting='safe', copy=True) + P1 = P1.astype(np.float64, casting='safe', copy=False) except TypeError: raise TypeError("The given P1 cannot be converted to a numeric" "array; got (P1.dtype={0})." .format(P1.dtype)) - if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): + if (P1.ndim != 1) or (P1.shape[0] != n_features): raise ValueError("P1 must be either 'identity' or a 1d array " "with the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." - .format(P1.shape[0], X.shape[1])) + .format(P1.shape[0], n_features)) # If X is sparse, make P2 sparse, too. 
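        # [editor's note, illustrative sketch -- not part of this patch]
        # P1 is a per-feature vector of L1 weights and P2 an L2 penalty
        # matrix (a 1d P2 acts as a diagonal matrix), both defined without
        # the intercept.  After the rescaling by alpha and l1_ratio further
        # below, the penalty term of the objective looks like (cf. the obj()
        # helper in the poisson enet test):
        #
        #     import numpy as np
        #     def penalty(w, P1, P2):
        #         """Hypothetical helper: L1/L2 penalty of coefficients w."""
        #         l1 = np.sum(P1 * np.abs(w))
        #         l2 = 0.5 * (w @ (P2 @ w) if P2.ndim == 2 else w @ (P2 * w))
        #         return l1 + l2
        #
        # With P1='identity' and P2='identity' this reduces to the usual
        # elastic net penalty
        #     alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1-l1_ratio) * ||w||_2^2.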
if isinstance(self.P2, str) and self.P2 == 'identity': if sparse.issparse(X): - P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() else: - P2 = np.ones(X.shape[1]) + P2 = np.ones(n_features) else: P2 = check_array(self.P2, copy=True, - accept_sparse=['csr', 'csc'], + accept_sparse=_stype, dtype=_dtype, ensure_2d=False) if P2.ndim == 1: P2 = np.asarray(P2) - if P2.shape[0] != X.shape[1]: + if P2.shape[0] != n_features: raise ValueError("P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " @@ -1739,12 +1852,12 @@ def fit(self, X, y, sample_weight=None): .format(P2.shape[0], X.shape[1])) if sparse.issparse(X): P2 = (sparse.dia_matrix((P2, 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + shape=(n_features, n_features))).tocsc() elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and P2.shape[0] == X.shape[1]): if sparse.issparse(X): P2 = (sparse.dia_matrix((P2, 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + shape=(n_features, n_features))).tocsc() else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " @@ -1752,37 +1865,31 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - family = self._family_instance - link = self._link_instance - - if self.fit_intercept: - # Note: intercept is first column <=> coef[0] is for intecept - if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X], - format=X.format) - else: - Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) - P1 = np.concatenate((np.array([0]), P1)) - if P2.ndim == 1: - P2 = np.concatenate((np.array([0]), P2)) - elif sparse.issparse(P2): - P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), - format=P2.format, - dtype=P2.dtype).tocsr() - else: - # as of numpy 1.13 this would work: - # P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], - # [np.zeros((X.shape[1], 1)), P2]]) - P2 = np.hstack((np.zeros((X.shape[1], 1)), P2)) - P2 = np.vstack((np.zeros((1, X.shape[1]+1)), P2)) + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['irls', 'least_squares', 'zero']: + raise ValueError("The argument start_params must be 'irls', " + "'least-squares', 'zero' or an array of " + " correct length;" + " got(start_params={0})".format(start_params)) else: - Xnew = X + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) - n_samples, n_features = Xnew.shape l1 = self.alpha * self.l1_ratio l2 = self.alpha * (1 - self.l1_ratio) - P1 *= l1 - P2 *= l2 + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. 
it is symmetric if P2.ndim == 2: @@ -1791,14 +1898,12 @@ def fit(self, X, y, sample_weight=None): else: P2 = 0.5 * (P2 + P2.T) - # 1.3 additional validations ########################################## + # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - if not np.all(weights >= 0): - raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. if not isinstance(self.P1, str): # if self.P1 != 'identity': @@ -1830,7 +1935,7 @@ def fit(self, X, y, sample_weight=None): else: if not np.all(linalg.eigvalsh(P2) >= epsneg): raise ValueError("P2 must be positive semi-definite.") - # TODO: if alpha=0 check that Xnew is not rank deficient + # TODO: if alpha=0 check that X is not rank deficient # TODO: what else to check? ####################################################################### @@ -1874,13 +1979,14 @@ def fit(self, X, y, sample_weight=None): z = eta + (y-mu)/hp # solve A*coef = b # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(Xnew, W, P2, z) + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) elif start_params == 'least_squares': # less restrictive tolerance for finding start values tol = np.max([self.tol, np.sqrt(self.tol)]) if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ elif self.l1_ratio <= 0.01: # ElasticNet says l1_ratio <= 0.01 is not reliable @@ -1888,19 +1994,21 @@ def fit(self, X, y, sample_weight=None): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, alpha=self.alpha*n_samples, tol=tol) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? 
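                    # [editor's note] This branch fits a plain elastic net on
                    # the link-transformed targets link.link(y), i.e. a rough
                    # least-squares-type fit in link space.  It ignores the
                    # GLM variance structure, hence the doubt in the TODO.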
reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio, tol=tol) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ else: # start_params == 'zero' - coef = np.zeros(n_features) if self.fit_intercept: + coef = np.zeros(n_features+1) coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) else: # assign given array as start values coef = start_params @@ -1915,24 +2023,28 @@ def fit(self, X, y, sample_weight=None): # Note: we already symmetriezed P2 = 1/2 (P2 + P2') if solver == 'irls': coef, self.n_iter_ = \ - _irls_solver(coef=coef, X=Xnew, y=y, weights=weights, P2=P2, - family=family, link=link, max_iter=self.max_iter, - tol=self.tol) + _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, + fit_intercept=self.fit_intercept, family=family, + link=link, max_iter=self.max_iter, tol=self.tol) # 4.2 L-BFGS ########################################################## elif solver == 'lbfgs': def func(coef, X, y, weights, P2, family, link): - mu, dev, devp = \ + mu, devp = \ family._mu_deviance_derivative(coef, X, y, weights, link) + dev = family.deviance(y, mu, weights) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - obj = 0.5 * dev + 0.5 * (coef @ L2) - objp = 0.5 * devp + L2 + L2 = P2 @ coef[idx:] + obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) + objp = 0.5 * devp + objp[idx:] += L2 return obj, objp - args = (Xnew, y, weights, P2, family, link) + args = (X, y, weights, P2, family, link) coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1952,50 +2064,66 @@ def func(coef, X, y, weights, P2, family, link): # precisely, expected hessian of deviance. 
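        # [editor's sketch, not part of this patch] The Fisher matrix used in
        # place of the Hessian has the sandwich form
        #     fisher = X.T @ diag(d1**2 / V(mu)) @ X,
        # with d1 = h'(eta) and V the unit variance function; this is exactly
        # the `diag` vector built in grad_hess below.  A minimal stand-alone
        # version, assuming the helpers defined in this module:
        #
        #     def expected_hessian(X, eta, mu, family, link, weights=1):
        #         d1 = link.inverse_derivative(eta)
        #         d2_sigma_inv = d1**2 / family.variance(mu, phi=1,
        #                                                weights=weights)
        #         return _safe_sandwich_dot(X, d2_sigma_inv, intercept=False)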
elif solver == 'newton-cg': def func(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = coef @ (P2 * coef) + L2 = coef[idx:] @ (P2 * coef[idx:]) else: - L2 = coef @ (P2 @ coef) - mu = link.inverse(X @ coef) + L2 = coef[idx:] @ (P2 @ coef[idx:]) + mu = link.inverse(_safe_lin_pred(X, coef)) return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 def grad(coef, X, y, weights, P2, family, link): + mu, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - eta = X @ coef - mu = link.inverse(eta) - d1 = link.inverse_derivative(eta) - grad = X.T @ (d1 * family.deviance_derivative(y, mu, weights)) - return 0.5 * grad + L2 + L2 = P2 @ coef[idx:] + objp = 0.5 * devp + objp[idx:] += L2 + return objp def grad_hess(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - eta = X @ coef + L2 = P2 @ coef[idx:] + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) d1 = link.inverse_derivative(eta) - grad = 0.5 * \ - (X.T @ (d1 * family.deviance_derivative(y, mu, weights))) \ - + L2 - # expected hessian = X.T @ diag_matrix @ X + temp = d1 * family.deviance_derivative(y, mu, weights) + if intercept: + grad = np.concatenate(([0.5 * temp.sum()], + 0.5 * temp @ X + L2)) + else: + grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + + # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix diag = d1**2 / family.variance(mu, phi=1, weights=weights) - def Hs(s): - ret = 0.5 * (X.T @ (diag * (X @ s))) + def Hs(coef): + # return (0.5 * fisher + P2) @ coef + # ret = 0.5 * (X.T @ (diag * (X @ coef))) + ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) if P2.ndim == 1: - ret += P2 * s + ret += P2 * coef[idx:] else: - ret += P2 @ s + ret += P2 @ coef[idx:] + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) + ret = np.concatenate(([0.5 * (h0i @ coef)], + ret + 0.5 * coef[0] * h0i[1:])) return ret return grad, Hs - args = (Xnew, y, weights, P2, family, link) + args = (X, y, weights, P2, family, link) coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) @@ -2007,13 +2135,14 @@ def Hs(s): elif solver == 'cd': # For coordinate descent, if X is sparse, it should be csc format # If X is sparse, P2 must also be csc - if sparse.issparse(Xnew): - Xnew = Xnew.tocsc(copy=self.copy_X) + if sparse.issparse(X): + X = X.tocsc(copy=self.copy_X) P2 = sparse.csc_matrix(P2) coef, self.n_iter_, self._n_cycles = \ - _cd_solver(coef=coef, X=Xnew, y=y, weights=weights, P1=P1, - P2=P2, family=family, link=link, + _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, + P2=P2, fit_intercept=self.fit_intercept, + family=family, link=link, max_iter=self.max_iter, tol=self.tol, selection=self.selection, random_state=random_state, diag_fisher=self.diag_fisher, copy_X=self.copy_X) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index ae8a5f4cfc5e7..cdac151b77de6 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -566,7 +566,7 @@ def obj(coef): pd = PoissonDistribution() link = 
LogLink() N = y.shape[0] - mu = link.inverse(X @ coef[1:]+coef[0]) + mu = link.inverse(X @ coef[1:] + coef[0]) alpha, l1_ratio = (1, 0.5) return 1./(2.*N) * pd.deviance(y, mu) \ + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ @@ -587,9 +587,9 @@ def obj(coef): assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) - # start_params='least_squares' with different alpha + # check warm_start, therefore start with different alpha glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, - family='poisson', + family='poisson', max_iter=300, link='log', solver='cd', tol=1e-5, start_params='zero') glm.fit(X, y) From c9a7a95e89deaadc40b92edd8e1208d550998a72 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 21 Apr 2019 17:03:07 +0200 Subject: [PATCH 051/209] Revised option start_params * renamed option irls into guess * removed option least_squares * updated tests --- doc/modules/linear_model.rst | 12 +- sklearn/linear_model/glm.py | 223 ++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 140 ++++++++-------- 3 files changed, 207 insertions(+), 168 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index e60e9e84a4747..4bede17af581a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -925,12 +925,12 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, - check_input=True, copy_X=True, family='poisson', - fit_dispersion=None, fit_intercept=True, l1_ratio=0, - link='log', max_iter=100, random_state=None, - selection='cyclic', solver='auto', - start_params='irls', tol=0.0001, verbose=0, - warm_start=False) + check_input=True, copy_X=True, diag_fisher=False, + family='poisson', fit_dispersion=None, + fit_intercept=True, l1_ratio=0, link='log', + max_iter=100, random_state=None, selection='cyclic', + solver='auto', start_params='guess', tol=0.0001, + verbose=0, warm_start=False) >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE array([0.24630169, 0.43373464]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 2afd1ddf8c79c..a53cc39ecd307 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -48,9 +48,6 @@ import scipy.sparse.linalg as splinalg from scipy.optimize import fmin_l_bfgs_b import warnings -from .base import LinearRegression -from .coordinate_descent import ElasticNet -from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y @@ -93,6 +90,14 @@ def _safe_lin_pred(X, coef): return X @ coef +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + def _safe_sandwich_dot(X, d, intercept=False): """Compute sandwich product X.T @ diag(d) @ X. @@ -100,7 +105,9 @@ def _safe_sandwich_dot(X, d, intercept=False): first column of X. X can be sparse, d must be an ndarray. 
Always returns a ndarray.""" if sparse.issparse(X): - temp = (X.transpose().multiply(d) @ X).toarray() + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) else: temp = (X.T * d) @ X if intercept: @@ -945,7 +952,8 @@ def _irls_step(X, W, P2, z, fit_intercept=True): else: if sparse.issparse(X): XtW = X.transpose().multiply(W) - A = (XtW @ X).toarray() + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) else: XtW = (X.T * W) A = XtW @ X @@ -1105,8 +1113,9 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, if intercept: Bj[0] = fisher.sum() if sparse.issparse(X): - Bj[idx:] = (X[:, j].transpose().multiply(fisher) @ X - ).toarray().ravel() + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() else: Bj[idx:] = (fisher * X[:, j]) @ X @@ -1477,27 +1486,32 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ optional (default='auto') Algorithm to use in the optimization problem: 'auto' Sets 'irls' if l1_ratio equals 0, else 'cd'. + 'cd' + Coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. Note that in order to avoid unnecessary memory + duplication of X in the ``fit`` method, X should be directly passed + as a Fortran-contiguous numpy array or sparse csc matrix. + 'irls' - Iterated reweighted least squares (with Fisher scoring). + Iterated reweighted least squares. It is the standard algorithm for GLMs. It cannot deal with L1 penalties. + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + 'newton-cg', 'lbfgs' - Cannot deal with L1 penalties. + Newton conjugate gradient algorithm cannot deal with L1 penalties. - 'cd' - Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. Note that in order to avoid unnecessary memory - duplication of the X argument in the ``fit`` method, X should be - directly passed as a Fortran-contiguous numpy array or sparse csc - matrix. + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -1505,11 +1519,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where g_i is the i-th component of the gradient (derivative) of + where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. For the cd solver, covergence is reached - when ``sum_i(|minimum-norm of g_i|)``, where g_i is the - subgradient of the objective and minimum-norm of g_i is the element of - the subgradient g_i with the smallest L2-norm. + when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the + subgradient of the objective and minimum-norm of ``g_i`` is the element + of the subgradient ``g_i`` with the smallest L2-norm. 
warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1518,23 +1532,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to ``fit``), option ``start_params`` sets the start values for ``coef_`` and ``intercept_``. - start_params : {'irls', 'least_squares', 'zero', array of shape \ - (n_features*, )}, optional (default='irls') + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') Relevant only if ``warm_start=False`` or if fit is called the first time (``self.coef_`` does not yet exist). - 'irls' + 'guess' Start values of mu are calculated by family.starting_mu(..). Then, - one step of irls obtains start values for ``coef_``. This gives - usually good results. - - 'least_squares' - Start values for ``coef_`` are obtained by a least squares fit in the - link space (y is transformed to the space of the linear predictor). + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step, else the Newton step is + calculated by the cd solver. + This gives usually good starting values. 'zero' All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the average of y. + start value for the intercept is obtained by the weighted average of y. array The array of size n_features* is directly used as start values @@ -1560,17 +1572,17 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. - diag_fisher : boolean, (default=False) - Only relevant for solver 'cd'. If ``False``, the full Fisher matrix - (expected Hessian) is computed in each outer iteretion (Newton - iteration). If ``True``, only a diagonal matrix (stored as 1d array) is - computed, such that fisher = X.T @ diag @ X. This saves memory and - matrix-matrix multiplications, but needs more matrix-vector - multiplications. If you use large sparse X or if you have many - features, i.e. n_features >> n_samples, you might set this option to - ``True``. - - copy_X : boolean, optional, default True + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + + copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
check_input : boolean, optional (default=True) @@ -1634,7 +1646,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params='irls', + tol=1e-4, warm_start=False, start_params='guess', selection='cyclic', random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha @@ -1867,11 +1879,10 @@ def fit(self, X, y, sample_weight=None): start_params = self.start_params if isinstance(start_params, str): - if start_params not in ['irls', 'least_squares', 'zero']: - raise ValueError("The argument start_params must be 'irls', " - "'least-squares', 'zero' or an array of " - " correct length;" - " got(start_params={0})".format(start_params)) + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) else: start_params = check_array(start_params, accept_sparse=False, force_all_finite=True, ensure_2d=False, @@ -1894,10 +1905,17 @@ def fit(self, X, y, sample_weight=None): # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric if P2.ndim == 2: if sparse.issparse(P2): - P2 = 0.5 * (P2 + P2.transpose()) + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() else: P2 = 0.5 * (P2 + P2.T) + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): @@ -1964,45 +1982,63 @@ def fit(self, X, y, sample_weight=None): else: coef = self.coef_ elif isinstance(start_params, str): - if start_params == 'irls': - # See 3.1 IRLS - # Use mu_start and apply one irls step to calculate coef + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls mu = family.starting_mu(y, weights=weights) - # linear predictor - eta = link.link(mu) - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / family.variance(mu, phi=1, weights=weights)) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(X, W, P2, z, - fit_intercept=self.fit_intercept) - elif start_params == 'least_squares': - # less restrictive tolerance for finding start values - tol = np.max([self.tol, np.sqrt(self.tol)]) - if self.alpha == 0: - reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(X, link.link(y)) - coef = reg.coef_ - elif self.l1_ratio <= 0.01: - # ElasticNet says l1_ratio <= 0.01 is not reliable - # => use Ridge - # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha*n_samples, tol=tol) - reg.fit(X, link.link(y)) - coef = reg.coef_ + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as 
X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=self.random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here else: - # TODO: Does this make sense at all? - reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio, - tol=tol) - reg.fit(X, link.link(y)) - coef = reg.coef_ + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) else: # start_params == 'zero' if self.fit_intercept: coef = np.zeros(n_features+1) @@ -2048,7 +2084,7 @@ def func(coef, X, y, weights, P2, family, link): coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) + maxiter=self.max_iter, factr=1e3) if self.verbose > 0: if info["warnflag"] == 1: warnings.warn("lbfgs failed to converge." 
@@ -2106,6 +2142,8 @@ def grad_hess(coef, X, y, weights, P2, family, link): # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix diag = d1**2 / family.variance(mu, phi=1, weights=weights) + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) def Hs(coef): # return (0.5 * fisher + P2) @ coef @@ -2116,7 +2154,6 @@ def Hs(coef): else: ret += P2 @ coef[idx:] if intercept: - h0i = np.concatenate(([diag.sum()], diag @ X)) ret = np.concatenate(([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:])) return ret @@ -2124,21 +2161,15 @@ def Hs(coef): return grad, Hs args = (X, y, weights, P2, family, link) - coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) + coef, self.n_iter_ = newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) # 4.4 coordinate descent ############################################## # Note: we already set P1 = l1*P1, see above # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') elif solver == 'cd': - # For coordinate descent, if X is sparse, it should be csc format - # If X is sparse, P2 must also be csc - if sparse.issparse(X): - X = X.tocsc(copy=self.copy_X) - P2 = sparse.csc_matrix(P2) - coef, self.n_iter_, self._n_cycles = \ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, P2=P2, fit_intercept=self.fit_intercept, diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index cdac151b77de6..17535c067bbb6 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -4,7 +4,7 @@ import scipy as sp from scipy import linalg, optimize, sparse -from sklearn.datasets import make_classification +from sklearn.datasets import make_classification, make_regression from sklearn.linear_model.glm import ( Link, IdentityLink, @@ -22,10 +22,12 @@ assert_array_equal, assert_array_almost_equal) +rng = np.random.RandomState(42) + + @pytest.mark.parametrize('link', Link.__subclasses__()) def test_link_properties(link): """Test link inverse and derivative.""" - rng = np.random.RandomState(0) x = rng.rand(100)*100 link = link() # instatiate object decimal = 10 @@ -86,7 +88,6 @@ def test_deviance_zero(family, chk_values): def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. 
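    # [editor's sketch, assumed reconstruction of the "trick" -- not part of
    # this patch] Setting y = mu makes the observed information equal to the
    # Fisher matrix (all (y - mu) terms vanish), so each column of the Fisher
    # matrix can be checked against a central difference of the score with y
    # held fixed at mu (coef, phi, X, weights, family, link as set up in this
    # test):
    #
    #     mu0 = link.inverse(X @ coef)
    #     def fisher_column(j, eps=1e-6):
    #         e = np.zeros_like(coef)
    #         e[j] = eps
    #         s_p = family._score(coef + e, phi, X, mu0, weights, link)
    #         s_m = family._score(coef - e, phi, X, mu0, weights, link)
    #         return -(s_p - s_m) / (2 * eps)   # approx. fisher[:, j]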
Trick: Use numerical differentiation with y = mu""" - rng = np.random.RandomState(0) coef = np.array([-2, 1, 0, 1, 2.5]) phi = 0.5 X = rng.randn(10, 5) @@ -218,7 +219,6 @@ def test_glm_P2_argument(P2): def test_glm_P2_positive_semidefinite(): """Test GLM for a positive semi-definite P2 argument.""" n_samples, n_features = 10, 5 - rng = np.random.RandomState(42) y = np.arange(n_samples) X = np.zeros((n_samples, n_features)) P2 = np.diag([100, 10, 5, 0, -1E-5]) @@ -351,20 +351,15 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize( - 'family', - [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecant()]) @pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_identiy_regression(family, solver): +def test_glm_identiy_regression(solver): """Test GLM regression with identity link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor(alpha=0, family=family, link='identity', - fit_intercept=False, solver=solver) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False, solver=solver, + start_params='zero', tol=1e-7) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -375,34 +370,42 @@ def test_glm_identiy_regression(family, solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecant()]) -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_log_regression(family, solver): +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 6), + ('newton-cg', 1e-7, 6), + ('cd', 1e-7, 6)]) +def test_glm_log_regression(family, solver, tol, dec): """Test GLM regression with log link on a simple dataset.""" - coef = [1, 2] + coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, start_params='least_squares') + solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + assert_array_almost_equal(res.coef_, coef, decimal=dec) @pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_normal_ridge(solver): +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 5), + ('newton-cg', 1e-6, 5), + ('cd', 1e-6, 6)]) +def test_normal_ridge(solver, tol, dec): """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. """ - rng = np.random.RandomState(0) alpha = 1.0 # 1. 
With more samples than features - n_samples, n_features, n_predict = 10, 5, 10 - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - T = rng.randn(n_predict, n_features) + n_samples, n_features, n_predict = 100, 7, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, @@ -410,69 +413,74 @@ def test_normal_ridge(solver): ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, - tol=1e-6, max_iter=100, solver=solver, - random_state=42) + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=False, - tol=1e-6, max_iter=100, solver=solver, - random_state=42, fit_dispersion='chisqr') + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng, + fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) mu = glm.predict(X) assert_almost_equal(glm.dispersion_, np.sum((y-mu)**2/(n_samples-n_features))) # 2. 
With more features than samples and sparse - n_samples, n_features, n_predict = 5, 10, 10 - y = rng.randn(n_samples) - X = sparse.csr_matrix(rng.randn(n_samples, n_features)) - T = sparse.csr_matrix(rng.randn(n_predict, n_features)) + n_samples, n_features, n_predict = 10, 100, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, - family='normal', link='identity', - fit_intercept=True, solver=solver, - max_iter=300, random_state=42) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=300, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, - family='normal', link='identity', - fit_intercept=False, solver=solver) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol*2, max_iter=300, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) -@pytest.mark.parametrize('solver, decimal, tol', - [('irls', 7, 1e-8), - ('lbfgs', 5, 1e-7), - ('newton-cg', 5, 1e-7), - ('cd', 7, 1e-8)]) -def test_poisson_ridge(solver, decimal, tol): +@pytest.mark.parametrize('solver, tol, dec', + [('irls', 1e-7, 6), + ('lbfgs', 1e-7, 5), + ('newton-cg', 1e-7, 5), + ('cd', 1e-7, 7)]) +def test_poisson_ridge(solver, tol, dec): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" @@ -493,18 +501,17 @@ def test_poisson_ridge(solver, decimal, tol): fit_intercept=True, family='poisson', link='log', tol=tol, solver=solver, max_iter=300, - random_state=42) + random_state=rng) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=decimal) + decimal=dec) assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=decimal) + decimal=dec) @pytest.mark.parametrize('diag_fisher', [False, True]) def test_normal_enet(diag_fisher): """Test elastic net regression with normal/gaussian family.""" - rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 n_samples, n_features = 20, 2 X = rng.randn(n_samples, n_features).copy(order='F') @@ -556,7 +563,8 @@ def test_poisson_enet(): y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', link='log', solver='cd', tol=1e-8, - selection='random', random_state=42) + selection='random', random_state=rng, + start_params='guess') glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) @@ -591,7 +599,7 @@ def obj(coef): glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, family='poisson', max_iter=300, link='log', solver='cd', tol=1e-5, - start_params='zero') + selection='cyclic', start_params='zero') glm.fit(X, y) # warm start with original alpha and use of sparse matrices glm.warm_start = True @@ -612,9 +620,9 @@ def test_binomial_enet(alpha): n_samples = 500 X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, n_informative=5, n_redundant=0, n_repeated=0, - random_state=0) + random_state=rng) log = LogisticRegression( - penalty='elasticnet', random_state=0, fit_intercept=False, tol=1e-6, + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), solver='saga') log.fit(X, y) From a7755de2cdc5022b6d41285730ed12bd41628e66 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 17:34:11 +0200 Subject: [PATCH 052/209] Fix a few typos --- doc/modules/linear_model.rst | 2 +- .../plot_poisson_spline_regression.py | 2 +- sklearn/linear_model/glm.py | 53 ++++++++++--------- sklearn/linear_model/tests/test_glm.py | 4 +- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 4bede17af581a..8f17c67d950de 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -887,7 +887,7 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function beeing minimized +exponential dispersion model (EDM) [11]_. The objective function being minimized becomes .. 
math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py index fce85fae1ea8c..30b5881bba1f5 100644 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -5,7 +5,7 @@ As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` example, a Poisson regression with penalized B-splines (P-splines) [1]_ is -fitted on slightly different sinusodial, Poisson distributed data and +fitted on slightly different sinusoidal, Poisson distributed data and compared to an AdaBoost model with decision trees. One can see, that this is a hard problem for both estimators. diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index a53cc39ecd307..a35c68828de81 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -29,7 +29,7 @@ # L2: w*P2*w with P2 a (semi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and # Tikhonov regularization). -# - The link funtion (instance of class Link) is necessary for the evaluation +# - The link function (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. # Solution: link as argument in those functions @@ -170,7 +170,7 @@ def _min_norm_sugrad(coef, grad, P2, P1): class Link(metaclass=ABCMeta): - """Abstract base class for Link funtions.""" + """Abstract base class for Link functions.""" @abstractmethod def link(self, mu): @@ -201,7 +201,7 @@ def derivative(self, mu): def inverse(self, lin_pred): """Compute the inverse link function h(lin_pred). - Gives the inverse relationship between linkear predictor and the mean + Gives the inverse relationship between linker predictor and the mean mu=E[Y], i.e. h(linear predictor) = mu. Parameters @@ -357,7 +357,7 @@ def include_lower_bound(self): @property def include_upper_bound(self): - """Get True if upper bound for y is includede: y <= upper_bound.""" + """Get True if upper bound for y is included: y <= upper_bound.""" return self._include_upper_bound def in_y_range(self, x): @@ -859,7 +859,7 @@ def __init__(self): class GeneralizedHyperbolicSecant(ExponentialDispersionModel): """A class for the Generalized Hyperbolic Secant (GHS) distribution. - The GHS distribution is for tagets y in (-inf, inf). + The GHS distribution is for targets y in (-inf, inf). """ def __init__(self): self._lower_bound = -np.Inf @@ -881,7 +881,7 @@ def unit_deviance(self, y, mu): class BinomialDistribution(ExponentialDispersionModel): """A class for the Binomial distribution. - The Binomial distribution is for tagets y in [0, 1]. + The Binomial distribution is for targets y in [0, 1]. """ def __init__(self): self._lower_bound = 0 @@ -1131,7 +1131,7 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, else: b = B[jdx, jdx] - # those ten lines aree what it is all about + # those ten lines are what it is all about if b <= 0: z = 0 elif P1[j] == 0: @@ -1199,7 +1199,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, diag_fisher=False, copy_X=True): """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. 
- The objective beeing minimized in the coefficients w=coef is:: + The objective being minimized in the coefficients w=coef is:: F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 @@ -1207,7 +1207,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, 1. Find optimal descent direction d by minimizing min_d F(w+d) = min_d F(w+d) - F(w) - 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + 2. Quadratic approximation of F(w+d)-F(w) = q(d): using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + ||P1*(w+d)||_1 - ||P1*w||_1 @@ -1228,7 +1228,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, ---------- coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. + If fit_intercept=True, then c=X.shape[1] + 1. X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present). If not sparse, @@ -1265,7 +1265,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, cycles over all features per inner loop. tol : float, optional (default=1e-4) - Covergence criterion is + Convergence criterion is sum_i(|minimum of norm of subgrad of objective_i|)<=tol. selection : str, optional (default='cyclic') @@ -1285,9 +1285,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, ------- coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. + If fit_intercept=True, then c=X.shape[1] + 1. - n_iter : numer of outer iterations = newton iterations + n_iter : number of outer iterations = newton iterations n_cycles : number of cycles over features @@ -1312,7 +1312,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, "format. Got P2 not sparse.") random_state = check_random_state(random_state) # Note: we already set P2 = l2*P2, P1 = l1*P1 - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') n_iter = 0 # number of outer iterations n_cycles = 0 # number of (complete) cycles over features converged = False @@ -1427,7 +1427,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- alpha : float, optional (default=1) - Constant that multiplies the penalty terms und thus determines the + Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this @@ -1481,9 +1481,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'logit' for family 'binomial' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statisic or the deviance statistic. If None, the + the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ @@ -1520,7 +1520,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. 
For the cd solver, covergence is reached + the objective function. For the cd solver, convergence is reached when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the subgradient of the objective and minimum-norm of ``g_i`` is the element of the subgradient ``g_i`` with the smallest L2-norm. @@ -1626,7 +1626,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If the target y is a ratio, appropriate sample weights s should be provided. - As an example, consider Poission distributed counts z (integers) and + As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite @@ -1681,7 +1681,7 @@ def fit(self, X, y, sample_weight=None): Target values. sample_weight : {None, array-like}, shape (n_samples,),\ - optinal (default=None) + optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has Var[Y_i]=phi/w_i * v(mu). @@ -1995,7 +1995,7 @@ def fit(self, X, y, sample_weight=None): if self.fit_intercept: score = np.concatenate(([temp.sum()], temp @ X)) else: - score = temp @ X # sampe as X.T @ temp + score = temp @ X # same as X.T @ temp d2_sigma_inv = d1 * d1 * sigma_inv diag_fisher = self.diag_fisher @@ -2051,12 +2051,12 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 4. fit # ####################################################################### - # algorithms for optimiation + # algorithms for optimization # TODO: Parallelize it? # 4.1 IRLS ############################################################ # Note: we already set P2 = l2*P2, see above - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') if solver == 'irls': coef, self.n_iter_ = \ _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, @@ -2137,7 +2137,7 @@ def grad_hess(coef, X, y, weights, P2, family, link): grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2)) else: - grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix @@ -2168,7 +2168,7 @@ def Hs(coef): # 4.4 coordinate descent ############################################## # Note: we already set P1 = l1*P1, see above # Note: we already set P2 = l2*P2, see above - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') elif solver == 'cd': coef, self.n_iter_, self._n_cycles = \ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, @@ -2215,7 +2215,8 @@ def linear_predictor(self, X): return X @ self.coef_ + self.intercept_ def predict(self, X, sample_weight=None): - """Predict uing GLM with feature matrix X. + """Predict using GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. 
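        For example, for a Poisson model fitted on frequencies
        y = counts/exposure with ``sample_weight=exposure``, the expected
        counts on new data can be obtained as (a usage sketch; ``glm`` and
        ``exposure`` are placeholder names)::

            expected_counts = glm.predict(X, sample_weight=exposure)
            # identical to exposure * glm.predict(X), because the prediction
            # is multiplied by the given sample_weight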
Parameters diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 17535c067bbb6..6172824cf1b79 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -29,7 +29,7 @@ def test_link_properties(link): """Test link inverse and derivative.""" x = rng.rand(100)*100 - link = link() # instatiate object + link = link() # instantiate object decimal = 10 if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 @@ -133,7 +133,7 @@ def test_sample_weights_validation(): with pytest.raises(ValueError): glm.fit(X, y, weights) - # 5. 1d but weith a negative value + # 5. 1d but with a negative value weights = [2, -1] with pytest.raises(ValueError): glm.fit(X, y, weights) From 9aa1fc41d726e5ac9007d9c867546aa70ea79ba7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 17:40:08 +0200 Subject: [PATCH 053/209] Make module private --- sklearn/linear_model/__init__.py | 4 ++-- sklearn/linear_model/{glm.py => _glm.py} | 0 sklearn/linear_model/tests/test_glm.py | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) rename sklearn/linear_model/{glm.py => _glm.py} (100%) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 0c5840f343a3a..0f7856fcc2046 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from .glm import (TweedieDistribution, - GeneralizedLinearRegressor) +from ._glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/_glm.py similarity index 100% rename from sklearn/linear_model/glm.py rename to sklearn/linear_model/_glm.py diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 6172824cf1b79..a148d11cb2632 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,3 +1,7 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + import numpy as np from numpy.testing import assert_allclose import pytest @@ -5,7 +9,8 @@ from scipy import linalg, optimize, sparse from sklearn.datasets import make_classification, make_regression -from sklearn.linear_model.glm import ( +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import ( Link, IdentityLink, LogLink, @@ -14,7 +19,7 @@ NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, GeneralizedHyperbolicSecant, BinomialDistribution, - GeneralizedLinearRegressor) +) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.utils.testing import ( From ca3eae24b4db9931eeac9fb925dba4f48199976c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 21:35:11 +0200 Subject: [PATCH 054/209] Working on tests --- sklearn/linear_model/tests/test_glm.py | 52 +++++++++++++------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a148d11cb2632..2c8a9c3d2c72c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -23,7 +23,7 @@ from sklearn.linear_model import ElasticNet, 
LogisticRegression, Ridge from sklearn.utils.testing import ( - assert_equal, assert_almost_equal, + assert_almost_equal, assert_array_equal, assert_array_almost_equal) @@ -35,16 +35,14 @@ def test_link_properties(link): """Test link inverse and derivative.""" x = rng.rand(100)*100 link = link() # instantiate object - decimal = 10 if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 - decimal = 8 - assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + assert_allclose(link.link(link.inverse(x)), x) # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) - assert_almost_equal(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x), decimal=decimal) + assert_allclose(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x)) # for LogitLink, in the following x should be between 0 and 1. # assert_almost_equal(link.inverse_derivative(link.link(x)), # 1./link.derivative(x), decimal=decimal) @@ -79,7 +77,7 @@ def test_family_bounds(family, expected): def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: - assert_almost_equal(family.deviance(x, x), 0, decimal=10) + assert_allclose(family.deviance(x, x), 0, atol=1e-9) @pytest.mark.parametrize( @@ -155,7 +153,7 @@ def test_glm_family_argument(f, fam): y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert_equal(type(glm._family_instance), type(fam)) + assert isinstance(glm._family_instance, fam.__class__) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -172,7 +170,7 @@ def test_glm_link_argument(l, link): y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert_equal(type(glm._link_instance), type(link)) + assert isinstance(glm._link_instance, link.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError): @@ -366,7 +364,7 @@ def test_glm_identiy_regression(solver): fit_intercept=False, solver=solver, start_params='zero', tol=1e-7) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + assert_allclose(res.coef_, coef) @pytest.mark.parametrize( @@ -375,11 +373,11 @@ def test_glm_identiy_regression(solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecant()]) -@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), - ('lbfgs', 1e-6, 6), - ('newton-cg', 1e-7, 6), - ('cd', 1e-7, 6)]) -def test_glm_log_regression(family, solver, tol, dec): +@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), + ('lbfgs', 1e-6), + ('newton-cg', 1e-7), + ('cd', 1e-7)]) +def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -388,7 +386,7 @@ def test_glm_log_regression(family, solver, tol, dec): alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef, decimal=dec) + assert_allclose(res.coef_, coef) @pytest.mark.filterwarnings('ignore::DeprecationWarning') @@ -421,10 +419,10 @@ def test_normal_ridge(solver, tol, dec): tol=tol, max_iter=100, solver=solver, check_input=False, random_state=rng) 
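# A standalone sketch (numpy only; the function names below are illustrative
# and the unpenalized intercept is ignored) of why Ridge above is given
# alpha*n_samples: with family='normal', link='identity' and l1_ratio=0 the
# GLM minimizes 1/(2n)*||y - Xw||^2 + (alpha/2)*||w||^2, while Ridge
# minimizes ||y - Xw||^2 + alpha_ridge*||w||^2; multiplying the GLM
# objective by 2n shows both share the same minimizer when
# alpha_ridge = alpha * n_samples.
import numpy as np

def glm_ridge_objective(w, X, y, alpha):
    n_samples = X.shape[0]
    return 0.5 / n_samples * np.sum((y - X @ w) ** 2) + 0.5 * alpha * w @ w

def sklearn_ridge_objective(w, X, y, alpha_ridge):
    return np.sum((y - X @ w) ** 2) + alpha_ridge * w @ w

# For every w: sklearn_ridge_objective(w, X, y, alpha * X.shape[0])
#              == 2 * X.shape[0] * glm_ridge_objective(w, X, y, alpha)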
glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_) + assert glm.intercept_ == pytest.approx(ridge.intercept_) + assert_allclose(glm.predict(T), ridge.predict(T)) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) @@ -435,13 +433,13 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng, fit_dispersion='chisqr') glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert_allclose(glm.predict(T), ridge.predict(T)) mu = glm.predict(X) - assert_almost_equal(glm.dispersion_, - np.sum((y-mu)**2/(n_samples-n_features))) + assert_allclose(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) # 2. With more features than samples and sparse n_samples, n_features, n_predict = 10, 100, 10 @@ -461,7 +459,7 @@ def test_normal_ridge(solver, tol, dec): tol=tol, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) + assert glm.coef_.shape == (X.shape[1], ) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) @@ -474,7 +472,7 @@ def test_normal_ridge(solver, tol, dec): tol=tol*2, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) + assert glm.coef_.shape == (X.shape[1], ) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) From 61bc6b8e2b1e227f539656744e9b4a4fa9f514f2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 5 Jun 2019 18:45:20 +0200 Subject: [PATCH 055/209] Improve tests --- sklearn/linear_model/tests/test_glm.py | 82 +++++++++++++------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 2c8a9c3d2c72c..7d747c23ae441 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -22,9 +22,7 @@ ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge -from sklearn.utils.testing import ( - assert_almost_equal, - assert_array_equal, assert_array_almost_equal) +from sklearn.utils.testing import assert_array_equal rng = np.random.RandomState(42) @@ -355,16 +353,16 @@ def test_glm_check_input_argument(check_input): @pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_identiy_regression(solver): +def test_glm_identity_regression(solver): """Test GLM regression with identity link on a simple dataset.""" - coef = [1, 2] + coef = [1., 2.] 
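# Reference sketch (numpy only; the *_ref names are illustrative): with
# alpha=0, family='normal' and link='identity', the fit below is ordinary
# least squares, so the generating coefficients [1., 2.] should be
# recovered up to solver tolerance.
X_ref = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]], dtype=float).T
coef_ref = np.linalg.lstsq(X_ref, X_ref @ np.array([1., 2.]), rcond=None)[0]
# coef_ref equals [1., 2.] up to floating point error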
X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False, solver=solver, start_params='zero', tol=1e-7) res = glm.fit(X, y) - assert_allclose(res.coef_, coef) + assert_allclose(res.coef_, coef, rtol=1e-6) @pytest.mark.parametrize( @@ -386,7 +384,7 @@ def test_glm_log_regression(family, solver, tol): alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_allclose(res.coef_, coef) + assert_allclose(res.coef_, coef, rtol=5e-6) @pytest.mark.filterwarnings('ignore::DeprecationWarning') @@ -420,9 +418,9 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_) - assert glm.intercept_ == pytest.approx(ridge.intercept_) - assert_allclose(glm.predict(T), ridge.predict(T)) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) @@ -434,9 +432,9 @@ def test_normal_ridge(solver, tol, dec): fit_dispersion='chisqr') glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_allclose(glm.predict(T), ridge.predict(T)) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-5) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) mu = glm.predict(X) assert_allclose(glm.dispersion_, np.sum((y-mu)**2/(n_samples-n_features))) @@ -452,7 +450,8 @@ def test_normal_ridge(solver, tol, dec): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, - solver='sag', normalize=False, max_iter=100000) + solver='sag', normalize=False, max_iter=100000, + random_state=42) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, @@ -460,22 +459,24 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, - solver='sag', normalize=False, max_iter=1000) + solver='sag', normalize=False, max_iter=1000, + random_state=42) ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=False, tol=tol*2, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-4) + assert_allclose(glm.intercept_, 
ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) @pytest.mark.parametrize('solver, tol, dec', @@ -506,10 +507,8 @@ def test_poisson_ridge(solver, tol, dec): solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=dec) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=dec) + assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) + assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-6) @pytest.mark.parametrize('diag_fisher', [False, True]) @@ -535,14 +534,14 @@ def test_normal_enet(diag_fisher): normalize=False, tol=1e-8, copy_X=True) enet.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) - assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) + assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) # 2. test normal enet on sparse data X = sparse.csc_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) - assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) + assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) def test_poisson_enet(): @@ -569,8 +568,8 @@ def test_poisson_enet(): selection='random', random_state=rng, start_params='guess') glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=2e-6) + assert_allclose(glm.coef_, glmnet_coef, rtol=2e-7) # test results with general optimization procedure def obj(coef): @@ -584,10 +583,10 @@ def obj(coef): + alpha * l1_ratio * np.sum(np.abs(coef[1:])) res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, options={'maxiter': 1000, 'disp': False}) - assert_almost_equal(glm.intercept_, res.x[0], decimal=5) - assert_almost_equal(glm.coef_, res.x[1:], decimal=5) - assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), - res.fun, decimal=8) + assert_allclose(glm.intercept_, res.x[0], rtol=1e-5) + assert_allclose(glm.coef_, res.x[1:], rtol=1e-5, atol=1e-9) + assert_allclose(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, rtol=1e-8) # same for start_params='zero' and selection='cyclic' # with reduced precision @@ -595,8 +594,8 @@ def obj(coef): link='log', solver='cd', tol=1e-5, selection='cyclic', start_params='zero') glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) # check warm_start, therefore start with different alpha glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, @@ -609,8 +608,8 @@ def obj(coef): glm.alpha = 1 X = sparse.csr_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) @pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) @@ -629,10 +628,11 @@ def test_binomial_enet(alpha): max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), solver='saga') log.fit(X, y) + glm = GeneralizedLinearRegressor( family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, 
alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', tol=1e-7) glm.fit(X, y) - assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) - assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) + assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) + assert_allclose(log.coef_[0, :], glm.coef_, rtol=2e-6) From b24a7cab9fc7d9f5dcec5b9d7657fee2d0a94283 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 5 Jun 2019 18:53:58 +0200 Subject: [PATCH 056/209] Remove unused dec parameter in tests --- sklearn/linear_model/tests/test_glm.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 7d747c23ae441..a3e943403a7a7 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -388,11 +388,11 @@ def test_glm_log_regression(family, solver, tol): @pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), - ('lbfgs', 1e-6, 5), - ('newton-cg', 1e-6, 5), - ('cd', 1e-6, 6)]) -def test_normal_ridge(solver, tol, dec): +@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), + ('lbfgs', 1e-6), + ('newton-cg', 1e-6), + ('cd', 1e-6)]) +def test_normal_ridge(solver, tol): """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. @@ -479,12 +479,12 @@ def test_normal_ridge(solver, tol, dec): assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) -@pytest.mark.parametrize('solver, tol, dec', - [('irls', 1e-7, 6), - ('lbfgs', 1e-7, 5), - ('newton-cg', 1e-7, 5), - ('cd', 1e-7, 7)]) -def test_poisson_ridge(solver, tol, dec): +@pytest.mark.parametrize('solver, tol', + [('irls', 1e-7), + ('lbfgs', 1e-7), + ('newton-cg', 1e-7), + ('cd', 1e-7)]) +def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" From f95b390c04fd4c0a333f08a649d9450ec44f0395 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 Jul 2017 21:50:10 +0200 Subject: [PATCH 057/209] ENH: add Generalized Linear Models, issue #5975 * new estimator GeneralizedLinearRegressor * loss functions for Tweedie family and Binomial * elasitc net penalties * control of penalties by matrix P2 and vector P1 * new solvers: coordinate descent, irls * tests * documentation * example for Poisson regression --- doc/modules/classes.rst | 1 + doc/modules/linear_model.rst | 129 + .../plot_poisson_spline_regression.py | 85 + sklearn/linear_model/__init__.py | 6 +- sklearn/linear_model/glm.py | 2331 +++++++++++++++++ sklearn/linear_model/tests/test_glm.py | 640 +++++ 6 files changed, 3191 insertions(+), 1 deletion(-) create mode 100644 examples/linear_model/plot_poisson_spline_regression.py create mode 100644 sklearn/linear_model/glm.py create mode 100644 sklearn/linear_model/tests/test_glm.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2dcf582a6ab39..4158e34e8bb8c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -727,6 +727,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 02f406f629e04..888566fab3601 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,6 +893,135 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized Linear Regression +============================= + +:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two +ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). + +Secondly, the squared loss function is replaced by the deviance :math:`D` of an +exponential dispersion model (EDM) [11]_. The objective function beeing minimized +becomes + +.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 + +\frac{\alpha(1-\rho)}{2} w^T P_2 w + +with sample weights :math:`s`. +:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in +the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows +for a more versatile L2 penalty. + +Use cases, where a loss different from the squared loss might be appropriate, +are the following: + + * If the target values :math:`y` are counts (non-negative integer valued) or + frequencies (non-negative), you might use a Poisson deviance with log-link. + + * If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. + + * If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). + +Since the linear predictor :math:`Xw` can be negative and +Poisson, Gamma and Inverse Gaussian distributions don't support negative values, +it is convenient to apply a link function different from the identity link +:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=\exp(Xw)`. 
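The Poisson, Gamma and Inverse Gaussian deviances are the Tweedie family
members with power :math:`p=1, 2, 3` and unit variance
:math:`v(\mu)=\mu^p`; intermediate powers such as :math:`1<p<2` (compound
Poisson-Gamma) can be requested as well. As a short sketch, the ``family``
parameter also accepts a distribution instance instead of a string:

    >>> from sklearn.linear_model import GeneralizedLinearRegressor
    >>> from sklearn.linear_model import TweedieDistribution
    >>> reg = GeneralizedLinearRegressor(alpha=0.5,
    ...                                  family=TweedieDistribution(power=1.5),
    ...                                  link='log')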
+ +Note that the feature matrix `X` should be standardized before fitting. This +ensures that the penalty treats features equally. The estimator can be used as +follows: + + >>> from sklearn.linear_model import GeneralizedLinearRegressor + >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE + GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, + check_input=True, copy_X=True, diag_fisher=False, + family='poisson', fit_dispersion=None, + fit_intercept=True, l1_ratio=0, link='log', + max_iter=100, random_state=None, selection='cyclic', + solver='auto', start_params='guess', tol=0.0001, + verbose=0, warm_start=False) + >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + array([0.24630169, 0.43373464]) + >>> reg.intercept_ #doctest: +ELLIPSIS + -0.76383633... + + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + +Mathematical formulation +------------------------ + +In the unpenalized case, the assumptions are the following: + + * The target values :math:`y_i` are realizations of random variables + :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` + with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter + :math:`\phi` and sample weights :math:`s_i`. + * The aim is to predict the expectation :math:`\mu_i` with + :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + +Note that the first assumption implies +:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance +function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the +same as specifying a unit variance function (they are one-to-one). + +Including penalties helps to avoid overfitting or, in case of L1 penalty, to +obtain sparse solutions. But there are also other motivations to include them, +e.g. accounting for the dependence structure of :math:`y`. + +The objective function, which is independent of :math:`\phi`, is minimized with +respect to the coefficients :math:`w`. + +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as + +.. math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ + +Two remarks: + +* The deviances for at least Normal, Poisson and Gamma distributions are + strictly consistent scoring functions for the mean :math:`\mu`, see Eq. + (19)-(20) in [12]_. 
+ +* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) + you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. + + +.. topic:: References: + + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + See also `Exponential dispersion model. `_ + + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py new file mode 100644 index 0000000000000..fce85fae1ea8c --- /dev/null +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -0,0 +1,85 @@ +""" +================================= +Poisson Regression with B-Splines +================================= + +As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` +example, a Poisson regression with penalized B-splines (P-splines) [1]_ is +fitted on slightly different sinusodial, Poisson distributed data and +compared to an AdaBoost model with decision trees. +One can see, that this is a hard problem for both estimators. + +.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines + and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. + `doi:10.1214/ss/1038425655 + `_ + +""" +print(__doc__) + +# Author: Christian Lorentzen +# based on the AdaBoost regression example from Noel Dawe +# License: BSD 3 clause + +# importing necessary libraries +import numpy as np +from scipy.linalg import toeplitz +# from scipy.interpolate import BSpline +from scipy.interpolate import splev +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import AdaBoostRegressor +from sklearn.linear_model import GeneralizedLinearRegressor + + +# Create the dataset +xmin, xmax = 0, 6 +rng = np.random.RandomState(1) +X = np.linspace(xmin, xmax, 500)[:, np.newaxis] +y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) +y = rng.poisson(y_true, X.shape[0]) + +# b-spline basis +nknots, degree = 40, 3 +ns = nknots - degree - 1 # number of base spline functions +dx = (xmax - xmin) / (nknots - 1 - 2 * degree) +knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) +coef = np.zeros(ns) +splineBasis = np.empty((X.shape[0], ns), dtype=float) +for i in range(ns): + coef[i] = 1 +# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ +# .ravel() + splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() + coef[i] = 0 + +# second order difference matrix +P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) +P2[0, 0] = P2[-1, -1] = 1 + +# Fit regression model +regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), + n_estimators=10, random_state=rng) + +regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', + fit_intercept=True, alpha=0.02, + l1_ratio=0.1, P2=P2) + +regr_1.fit(X, y) +regr_2.fit(splineBasis, y) + +# Predict +y_1 = regr_1.predict(X) +y_2 = regr_2.predict(splineBasis) + +# Plot the results +plt.figure() +plt.plot(X, y_true, c="b", label="true mean") +plt.scatter(X, y, c="k", marker='.', label="training samples") +plt.plot(X, y_1, 
c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 770a1a49b600e..cbb2ad8826358 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,6 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from .glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -78,4 +80,6 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution'] diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py new file mode 100644 index 0000000000000..ac0007c1789a8 --- /dev/null +++ b/sklearn/linear_model/glm.py @@ -0,0 +1,2331 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. +# License: BSD 3 clause + +# TODO: Should the option `normalize` be included (like other linear models)? +# So far, it is not included. User must pass a normalized X. +# TODO: Add cross validation support, e.g. GCV? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? +# TODO: Negative values in P1 are not allowed so far. They could be used +# for group lasso. + +# Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# Estimators in sklearn are either regressors or classifiers. A GLM can do +# both depending on the distr (Normal => regressor, Binomial => classifier). +# Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a positive (semi-) definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). +# - The link funtion (instance of class Link) is necessary for the evaluation +# of deviance, score, Fisher and Hessian matrix as a functions of the +# coefficients, which is needed by optimizers. +# Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. 
coef[0] + + +from __future__ import division +from abc import ABCMeta, abstractmethod +import numbers +import numpy as np +from scipy import linalg, sparse, special +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +import warnings +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_X_y +from ..utils.optimize import newton_cg +from ..utils.validation import check_is_fitted, check_random_state + + +def _check_weights(sample_weight, n_samples): + """Check that sample weights are non-negative and have the right shape.""" + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight * np.ones(n_samples) + else: + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as " + "y") + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") + + return weights + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. 
+ + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link funtions.""" + + @abstractmethod + def link(self, mu): + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + raise NotImplementedError + + @abstractmethod + def derivative(self, mu): + """Compute the derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + raise NotImplementedError + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linkear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Compute 2nd derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + +class IdentityLink(Link): + """The identity link function g(x)=x.""" + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x).""" + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. 
- 2 * ep) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + include_lower_bound + include_upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _mu_deviance_derivative + _score + _fisher_matrix + _observed_information + _eta_mu_score_fisher + + References + ---------- + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + @property + def lower_bound(self): + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound + + @property + def upper_bound(self): + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound + + @property + def include_lower_bound(self): + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound + + @property + def include_upper_bound(self): + """Get True if upper bound for y is includede: y <= upper_bound.""" + return self._include_upper_bound + + def in_y_range(self, x): + """Returns ``True`` if x is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. + """ + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) + + @abstractmethod + def unit_variance(self, mu): + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance_derivative(self, mu): + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. + """ + raise NotImplementedError() + + def variance(self, mu, phi=1, weights=1): + r"""Compute the variance function. + + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. 
+ + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weights=1): + r"""Compute the derivative of the variance w.r.t. mu. + + Returns + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + raise NotImplementedError() + + def unit_deviance_derivative(self, y, mu): + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + return -2 * (y - mu) / self.unit_variance(mu) + + def deviance(self, y, mu, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, mu)) + + def deviance_derivative(self, y, mu, weights=1): + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. 
- ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp + + def _score(self, coef, phi, X, y, weights, link): + r"""Compute the score function. + + The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). + It is given by + + .. math: + + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + temp = sigma_inv * d * (y - mu) + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + return score + + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""Compute the Fisher information matrix. + + The Fisher information matrix, also known as expected information + matrix is given by + + .. math: + + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see func:`_score`. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weights, link): + r"""Compute the observed information matrix. + + The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood, is given by + + .. math: + + \mathbf{H}(\boldsymbol{w}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T} + = \mathbf{X}^T \left[ + - \mathbf{D}' \mathbf{R} + + \mathbf{D}^2 \mathbf{V} \mathbf{R} + + \mathbf{D}^2 + \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, + + with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, + :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ + v(\mu_i)} + \right)`, + see :func:`score_` function and :func:`_fisher_matrix`. 
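+
+        Note: For the canonical link of a family (e.g. the log link for the
+        Poisson distribution), the term
+        :math:`-\mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R}`
+        vanishes, so the observed information coincides with the Fisher
+        matrix of :func:`_fisher_matrix`.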
+ """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + dp = link.inverse_derivative2(lin_pred) + d2 = link.inverse_derivative(lin_pred)**2 + v = self.unit_variance_derivative(mu)/self.unit_variance(mu) + r = y - mu + temp = sigma_inv * (-dp * r + d2 * v * r + d2) + intercept = (coef.size == X.shape[1] + 1) + observed_information = _safe_sandwich_dot(X, temp, + intercept=intercept) + return observed_information + + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, + diag_fisher=False): + """Compute linear predictor, mean, score function and fisher matrix. + + It calculates the linear predictor, the mean, score function + (derivative of log-likelihood) and Fisher information matrix + all in one go as function of `coef` (:math:`w`) and the data. + + Parameters + ---------- + diag_fisher : boolean, optional (default=False) + If ``True``, returns only an array d such that + fisher = X.T @ np.diag(d) @ X. + + Returns + ------- + (eta, mu, score, fisher) : tuple with 4 elements + The 4 elements are: + + * eta: ndarray, shape (X.shape[0],) + * mu: ndarray, shape (X.shape[0],) + * score: ndarray, shape (X.shape[0],) + * fisher: + + * If diag_fisher is ``False``, the full fisher matrix, + an array of shape (X.shape[1], X.shape[1]) + * If diag_fisher is ``True`, an array of shape (X.shape[0]) + """ + intercept = (coef.size == X.shape[1] + 1) + # eta = linear predictor + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) + d1 = link.inverse_derivative(eta) # = h'(eta) + # Alternatively: + # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g + # d1 = 1./link.derivative(mu) + d1_sigma_inv = d1 * sigma_inv + temp = d1_sigma_inv * (y - mu) + if intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X + + d2_sigma_inv = d1 * d1_sigma_inv + if diag_fisher: + fisher_matrix = d2_sigma_inv + else: + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return eta, mu, score, fisher_matrix + + +class TweedieDistribution(ExponentialDispersionModel): + r"""A class for the Tweedie distribution. + + A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto \mu^power`. + + Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (0,1) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + + Parameters + ---------- + power : float (default=0) + The variance power of the `unit_variance` + :math:`v(\mu) = \mu^{power}`. + For ``0 0) and (power < 1): + raise ValueError('For 0 1) and (power < 2): + # Compound Poisson + self._lower_bound = 0 + self._include_lower_bound = True + elif power == 2: + # GammaDistribution + self._lower_bound = 0 + self._include_lower_bound = False + elif (power > 2) and (power < 3): + # Positive Stable + self._lower_bound = 0 + self._include_lower_bound = False + elif power == 3: + # InverseGaussianDistribution + self._lower_bound = 0 + self._include_lower_bound = False + elif power > 3: + # Positive Stable + self._lower_bound = 0 + self._include_lower_bound = False + else: + raise ValueError('The power must be a float, i.e. 
real number, ' + 'got (power={})'.format(power)) + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + self._power = power + + def unit_variance(self, mu): + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return self.power * np.power(mu, self.power - 1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + # NormalDistribution + return (y - mu)**2 + if p == 1: + # PoissonDistribution + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (special.xlogy(y, y/mu) - y + mu) + elif p == 2: + # GammaDistribution + return 2 * (np.log(mu/y) + y/mu - 1) + else: + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + + +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. + + The GHS distribution is for tagets y in (-inf, inf). + """ + def __init__(self): + self._lower_bound = -np.Inf + self._upper_bound = np.Inf + self._include_lower_bound = False + self._include_upper_bound = False + + def unit_variance(self, mu): + return 1 + mu**2 + + def unit_variance_derivative(self, mu): + return 2 * mu + + def unit_deviance(self, y, mu): + return (2 * y * (np.arctan(y) - np.arctan(mu)) + + np.log((1 + mu**2)/(1 + y**2))) + + +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution is for tagets y in [0, 1]. + """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + +def _irls_step(X, W, P2, z, fit_intercept=True): + """Compute one step in iteratively reweighted least squares. + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. 
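+
+    Note: For the normal distribution with identity link, W equals the
+    sample weights and z = y, so a single step already solves the penalized
+    weighted least squares problem; for other families and links the step
+    is iterated with updated W and z.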
+ + Parameters + ---------- + X : {ndarray, sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present) + + W : ndarray, shape (n_samples,) + + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) + + z : ndarray, shape (n_samples,) + Working observations + + fit_intercept : boolean, optional (default=True) + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + """ + # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) + if P2.ndim == 1: + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() + else: + A[1:, 1:] += P2 + else: + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() + else: + A += P2 + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + return coef + + +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + # eta = linear predictor + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. + # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) + # updated linear predictor + # do it here for updated values for tolerance + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? 
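+        # (Note: the objective is 1/2 * deviance + 1/2 * w' P2 w with P2
+        # already scaled by l2 = alpha * (1 - l1_ratio); its gradient is
+        # -X' D (y-mu)/V(mu) + P2 w, and the unpenalized intercept
+        # component -sum(temp) is prepended separately below.)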
+ # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + gradient += P2 * coef[idx:] + else: + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent, i.e. cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + B = fisher + if P2.ndim == 1: + coef_P2 = coef[idx:] * P2 + if not diag_fisher: + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 + else: + coef_P2 = coef[idx:] @ P2 + if not diag_fisher: + if sparse.issparse(P2): + B[idx:, idx:] += P2.toarray() + else: + B[idx:, idx:] += P2 + # A = -score + coef_P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 + # inner loop + for inner_iter in range(1, max_inner_iter+1): + inner_iter += 1 + n_cycles += 1 + # cycle through features, update intercept separately at the end + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] + if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() + if sparse.issparse(X): + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() + else: + Bj[idx:] = (fisher * X[:, j]) @ X + + if P2.ndim == 1: + Bj[idx:] += P2[j] + else: + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] + else: + b = B[jdx, jdx] + + # those ten lines aree what it is all about + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[jdx] + d[jdx]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[jdx] + d[jdx]): + z = -(a - P1[j])/b + else: + z = -(coef[jdx] + d[jdx]) + + # update direction d + d[jdx] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + # Bj = B[:, j] calculated above, still valid + A += Bj * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, jdx] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective beeing minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H + P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. 
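+    Note: After step 3, an Armijo-type backtracking line search with
+    parameters beta=0.5 and sigma=0.01 safeguards the update w -> w + la*d.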
+ Note: f' = -score, H = Fisher matrix + + Parameters + ---------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + + X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present). If not sparse, + pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. + + y : ndarray, shape (n_samples,) + Target values. + + weights: ndarray, shape (n_samples,) + Sample weights with which the deviance is weighted. The weights must + bee normalized and sum to 1. + + P1 : {ndarray}, shape (n_features,) + The L1-penalty vector (=diagonal matrix) + + P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix). If a matrix is + passed, it must be symmetric. If X is sparse, P2 must also be sparse. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : ExponentialDispersionModel + + link : Link + + max_iter : int, optional (default=100) + Maximum numer of outer (Newton) iterations. + + max_inner_iter : int, optional (default=1000) + Maximum number of iterations in each inner loop, i.e. max number of + cycles over all features per inner loop. + + tol : float, optional (default=1e-4) + Covergence criterion is + sum_i(|minimum of norm of subgrad of objective_i|)<=tol. + + selection : str, optional (default='cyclic') + If 'random', randomly chose features in inner loop. + + random_state : {int, RandomState instance, None}, optional (default=None) + + diag_fisher : boolean, optional (default=False) + ``False`` calculates full fisher matrix, ``True`` only diagonal matrix + s.t. fisher = X.T @ diag @ X. This saves storage but needs more + matrix-vector multiplications. + + copy_X : boolean, optional (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + + n_iter : numer of outer iterations = newton iterations + + n_cycles : number of cycles over features + + References + ---------- + Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + X = check_array(X, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if P2.ndim == 2: + P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if sparse.issparse(X): + if not sparse.isspmatrix_csc(X): + raise ValueError("If X is sparse, it must be in csc format" + "; got (format={})".format(X.format)) + if not sparse.isspmatrix_csc(P2): + raise ValueError("If X is sparse, P2 must also be sparse csc" + "format. 
Got P2 not sparse.") + random_state = check_random_state(random_state) + # Note: we already set P2 = l2*P2, P1 = l1*P1 + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + n_iter = 0 # number of outer iterations + n_cycles = 0 # number of (complete) cycles over features + converged = False + n_samples, n_features = X.shape + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + # line search parameters + (beta, sigma) = (0.5, 0.01) + # some precalculations + # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a + # 1d array representing a diagonal matrix. + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd[idx:], ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) + else: + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum-norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): + """Regression via a Generalized Linear Model (GLM) with penalties. + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). 
Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + For ``P1=P2='identity'``, the penalty is the elastic net:: + + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 + + If you are interested in controlling the L1 and L2 penalties + separately, keep in mind that this is equivalent to:: + + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms und thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (default=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``'identity'`` is the same as a 1d array of ones. + Note that n_features = X.shape[1]. + + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. + + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') + The link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). 
Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statisic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ + optional (default='auto') + Algorithm to use in the optimization problem: + + 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. + + 'cd' + Coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. Note that in order to avoid unnecessary memory + duplication of X in the ``fit`` method, X should be directly passed + as a Fortran-contiguous numpy array or sparse csc matrix. + + 'irls' + Iterated reweighted least squares. + It is the standard algorithm for GLMs. It cannot deal with + L1 penalties. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + + 'newton-cg', 'lbfgs' + Newton conjugate gradient algorithm cannot deal with L1 penalties. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. For the cd solver, covergence is reached + when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the + subgradient of the objective and minimum-norm of ``g_i`` is the element + of the subgradient ``g_i`` with the smallest L2-norm. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step, else the Newton step is + calculated by the cd solver. + This gives usually good starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + selection : str, optional (default='cyclic') + For the solver 'cd' (coordinate descent), the coordinates (features) + can be updated in either cyclic or random order. 
+ If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially in the same order. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + random_state : {int, RandomState instance, None}, optional (default=None) + The seed of the pseudo random number generator that selects a random + feature to be updated for solver 'cd' (coordinate descent). + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + check_input : boolean, optional (default=True) + Allow to bypass several checks on input: y values in range of family, + sample_weight non-negative, P2 positive semi-definite. + Don't use this parameter unless you know what you do. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. + + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + + If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poission distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=s)``. The weights are necessary for the right (finite + sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. 
+ + References + ---------- + For the coordinate descent implementation: + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', + fit_intercept=True, family='normal', link='auto', + fit_dispersion=None, solver='auto', max_iter=100, + tol=1e-4, warm_start=False, start_params='guess', + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.start_params = start_params + self.selection = selection + self.random_state = random_state + self.diag_fisher = diag_fisher + self.copy_X = copy_X + self.check_input = check_input + self.verbose = verbose + + def fit(self, X, y, sample_weight=None): + """Fit a Generalized Linear Model. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + y : array-like, shape (n_samples,) + Target values. + + sample_weight : {None, array-like}, shape (n_samples,),\ + optinal (default=None) + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. + + Returns + ------- + self : returns an instance of self. + """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of __init__ ################################## + # Guarantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: + if self.family == 'normal': + self._family_instance = NormalDistribution() + elif self.family == 'poisson': + self._family_instance = PoissonDistribution() + elif self.family == 'gamma': + self._family_instance = GammaDistribution() + elif self.family == 'inverse.gaussian': + self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() + else: + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) + + # Guarantee that self._link_instance is set to an instance of + # class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() + else: + raise ValueError( + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) + + # validate further arguments + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be a non-negative number;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool;" + " got {0}".format(self.fit_intercept)) + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) + if (not isinstance(self.max_iter, int) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random'; got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(self.check_input)) + + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params + if isinstance(self.P1, str) and self.P1 == 'identity': + P1 = np.ones(n_features) + else: + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=False) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) + if (P1.ndim != 1) or (P1.shape[0] != n_features): + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." + .format(P1.shape[0], n_features)) + # If X is sparse, make P2 sparse, too. 
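+        # Example of a non-trivial P2 (illustration only): a first-order
+        # difference penalty sum_j (w_{j+1} - w_j)**2 over ordered features
+        # corresponds to passing
+        #     D = np.diff(np.eye(n_features), axis=0)
+        #     P2 = D.T @ D
+        # Any symmetric positive semi-definite matrix is a valid choice.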
+ if isinstance(self.P2, str) and self.P2 == 'identity': + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() + else: + P2 = np.ones(n_features) + else: + P2 = check_array(self.P2, copy=True, + accept_sparse=_stype, + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + P2 = np.asarray(P2) + if P2.shape[0] != n_features: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + else: + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0}, {1})), needed ({2}, {2})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) + + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) + else: + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) + + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1 - self.l1_ratio) + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if P2.ndim == 2: + if sparse.issparse(P2): + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() + else: + P2 = 0.5 * (P2 + P2.T) + + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + + # 1.4 additional validations ########################################## + if self.check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
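+            # Note: P1 was already multiplied by l1 = alpha * l1_ratio >= 0
+            # above, which does not change signs, so this check still
+            # reflects the user-supplied P1 (e.g. P1=[1., 1., 0.] applies
+            # the L1 penalty to the first two coefficients only).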
+ if not isinstance(self.P1, str): # if self.P1 != 'identity': + if not np.all(P1 >= 0): + raise ValueError("P1 must not have negative values.") + # check if P2 is positive semidefinite + # np.linalg.cholesky(P2) 'only' asserts positive definite + if not isinstance(self.P2, str): # self.P2 != 'identity' + # due to numerical precision, we allow eigenvalues to be a + # tiny bit negative + epsneg = -10 * np.finfo(P2.dtype).epsneg + if P2.ndim == 1 or P2.shape[0] == 1: + p2 = P2 + if sparse.issparse(P2): + p2 = P2.toarray() + if not np.all(p2 >= 0): + raise ValueError("1d array P2 must not have negative " + "values.") + elif sparse.issparse(P2): + # for sparse matrices, not all eigenvals can be computed + # efficiently, use only half of n_features + # k = how many eigenvals to compute + k = np.min([10, n_features // 10 + 1]) + sigma = 0 # start searching near this value + which = 'SA' # find smallest algebraic eigenvalues first + if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, + which=which) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + else: + if not np.all(linalg.eigvalsh(P2) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + # TODO: if alpha=0 check that X is not rank deficient + # TODO: what else to check? + + ####################################################################### + # 2. rescaling of weights (sample_weight) # + ####################################################################### + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights_sum = np.sum(weights) + weights = weights/weights_sum + + ####################################################################### + # 3. initialization of coef = (intercept_, coef_) # + ####################################################################### + # Note: Since phi=self.dispersion_ does not enter the estimation + # of mu_i=E[y_i], set it to 1. 
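+        # Possible initializations of coef, in this order of precedence:
+        #   1. warm_start=True with a previous fit: reuse coef_/intercept_.
+        #   2. start_params='guess': mu = family.starting_mu(y), then one
+        #      Newton step (a cd cycle or an irls step, depending on solver).
+        #   3. start_params='zero': coef = 0 and, if fit_intercept=True,
+        #      intercept = link(weighted mean of y).
+        #   4. start_params given as array: use these values directly.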
+ + # set start values for coef + coef = None + if self.warm_start and hasattr(self, 'coef_'): + if self.fit_intercept: + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) + else: + coef = self.coef_ + elif isinstance(start_params, str): + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls + mu = family.starting_mu(y, weights=weights) + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=self.random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here + else: + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) + else: # start_params == 'zero' + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) + else: # assign given array as start values + coef = start_params + + ####################################################################### + # 4. fit # + ####################################################################### + # algorithms for optimiation + # TODO: Parallelize it? 
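+        # All solver branches below minimize the same objective,
+        #     1/2 * deviance(y, mu; weights) + 1/2 * w' P2 w (+ ||P1 w||_1
+        # for the cd solver); weights were rescaled to sum to 1 and P1, P2
+        # already carry the alpha and l1_ratio factors.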
+
+        # 4.1 IRLS ############################################################
+        # Note: we already set P2 = l2*P2, see above
+        # Note: we already symmetrized P2 = 1/2 (P2 + P2')
+        if solver == 'irls':
+            coef, self.n_iter_ = \
+                _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2,
+                             fit_intercept=self.fit_intercept, family=family,
+                             link=link, max_iter=self.max_iter, tol=self.tol)
+
+        # 4.2 L-BFGS ##########################################################
+        elif solver == 'lbfgs':
+            def func(coef, X, y, weights, P2, family, link):
+                mu, devp = \
+                    family._mu_deviance_derivative(coef, X, y, weights, link)
+                dev = family.deviance(y, mu, weights)
+                intercept = (coef.size == X.shape[1] + 1)
+                idx = 1 if intercept else 0  # offset if coef[0] is intercept
+                if P2.ndim == 1:
+                    L2 = P2 * coef[idx:]
+                else:
+                    L2 = P2 @ coef[idx:]
+                obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2)
+                objp = 0.5 * devp
+                objp[idx:] += L2
+                return obj, objp
+
+            args = (X, y, weights, P2, family, link)
+            coef, loss, info = fmin_l_bfgs_b(
+                func, coef, fprime=None, args=args,
+                iprint=(self.verbose > 0) - 1, pgtol=self.tol,
+                maxiter=self.max_iter, factr=1e3)
+            if self.verbose > 0:
+                if info["warnflag"] == 1:
+                    warnings.warn("lbfgs failed to converge."
+                                  " Increase the number of iterations.",
+                                  ConvergenceWarning)
+                elif info["warnflag"] == 2:
+                    warnings.warn("lbfgs failed for the reason: {0}"
+                                  .format(info["task"]))
+            self.n_iter_ = info['nit']
+
+        # 4.3 Newton-CG #######################################################
+        # We again use the fisher matrix instead of the hessian, i.e. the
+        # expected hessian of the deviance.
+        elif solver == 'newton-cg':
+            def func(coef, X, y, weights, P2, family, link):
+                intercept = (coef.size == X.shape[1] + 1)
+                idx = 1 if intercept else 0  # offset if coef[0] is intercept
+                if P2.ndim == 1:
+                    L2 = coef[idx:] @ (P2 * coef[idx:])
+                else:
+                    L2 = coef[idx:] @ (P2 @ coef[idx:])
+                mu = link.inverse(_safe_lin_pred(X, coef))
+                return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2
+
+            def grad(coef, X, y, weights, P2, family, link):
+                mu, devp = \
+                    family._mu_deviance_derivative(coef, X, y, weights, link)
+                intercept = (coef.size == X.shape[1] + 1)
+                idx = 1 if intercept else 0  # offset if coef[0] is intercept
+                if P2.ndim == 1:
+                    L2 = P2 * coef[idx:]
+                else:
+                    L2 = P2 @ coef[idx:]
+                objp = 0.5 * devp
+                objp[idx:] += L2
+                return objp
+
+            def grad_hess(coef, X, y, weights, P2, family, link):
+                intercept = (coef.size == X.shape[1] + 1)
+                idx = 1 if intercept else 0  # offset if coef[0] is intercept
+                if P2.ndim == 1:
+                    L2 = P2 * coef[idx:]
+                else:
+                    L2 = P2 @ coef[idx:]
+                eta = _safe_lin_pred(X, coef)
+                mu = link.inverse(eta)
+                d1 = link.inverse_derivative(eta)
+                temp = d1 * family.deviance_derivative(y, mu, weights)
+                if intercept:
+                    grad = np.concatenate(([0.5 * temp.sum()],
+                                           0.5 * temp @ X + L2))
+                else:
+                    grad = 0.5 * temp @ X + L2  # same as 0.5 * X.T @ temp + L2
+
+                # expected hessian = fisher = X.T @ diag_matrix @ X
+                # calculate only diag_matrix
+                diag = d1**2 / family.variance(mu, phi=1, weights=weights)
+                if intercept:
+                    h0i = np.concatenate(([diag.sum()], diag @ X))
+
+                def Hs(coef):
+                    # return (0.5 * fisher + P2) @ coef
+                    # ret = 0.5 * (X.T @ (diag * (X @ coef)))
+                    ret = 0.5 * ((diag * (X @ coef[idx:])) @ X)
+                    if P2.ndim == 1:
+                        ret += P2 * coef[idx:]
+                    else:
+                        ret += P2 @ coef[idx:]
+                    if intercept:
+                        ret = np.concatenate(([0.5 * (h0i @ coef)],
+                                              ret + 0.5 * coef[0] * h0i[1:]))
+                    return ret
+
+                return grad, Hs
+
+            args = (X, y, weights, P2, family, link)
+            coef, self.n_iter_ = \
+                newton_cg(grad_hess, func, grad, coef, args=args,
+                          maxiter=self.max_iter, tol=self.tol)
+
+        # 4.4 coordinate descent ##############################################
+        # Note: we already set P1 = l1*P1, see above
+        # Note: we already set P2 = l2*P2, see above
+        # Note: we already symmetrized P2 = 1/2 (P2 + P2')
+        elif solver == 'cd':
+            coef, self.n_iter_, self._n_cycles = \
+                _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1,
+                           P2=P2, fit_intercept=self.fit_intercept,
+                           family=family, link=link,
+                           max_iter=self.max_iter, tol=self.tol,
+                           selection=self.selection, random_state=random_state,
+                           diag_fisher=self.diag_fisher, copy_X=self.copy_X)
+
+        #######################################################################
+        # 5. postprocessing                                                   #
+        #######################################################################
+        if self.fit_intercept:
+            self.intercept_ = coef[0]
+            self.coef_ = coef[1:]
+        else:
+            # set intercept to zero as the other linear models do
+            self.intercept_ = 0.
+            self.coef_ = coef
+
+        if self.fit_dispersion in ['chisqr', 'deviance']:
+            # attention: correct for the earlier rescaling of the weights
+            self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum
+
+        return self
+
+    def linear_predictor(self, X):
+        """Compute the linear_predictor = X*coef_ + intercept_.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        C : array, shape (n_samples,)
+            Returns predicted values of linear predictor.
+        """
+        check_is_fitted(self, "coef_")
+        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+                        dtype='numeric', copy=True, ensure_2d=True,
+                        allow_nd=False)
+        return X @ self.coef_ + self.intercept_
+
+    def predict(self, X, sample_weight=None):
+        """Predict using GLM with feature matrix X.
+
+        If sample_weight is given, returns prediction*sample_weight.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Samples.
+
+        sample_weight : {None, array-like}, shape (n_samples,), optional \
+                (default=None)
+
+        Returns
+        -------
+        C : array, shape (n_samples,)
+            Returns predicted values times sample_weight.
+        """
+        # TODO: Is copy=True necessary?
+        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+                        dtype='numeric', copy=True, ensure_2d=True,
+                        allow_nd=False)
+        eta = self.linear_predictor(X)
+        mu = self._link_instance.inverse(eta)
+        weights = _check_weights(sample_weight, X.shape[0])
+
+        return mu*weights
+
+    def estimate_phi(self, X, y, sample_weight=None):
+        """Estimate/fit the dispersion parameter phi.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Training data.
+
+        y : array-like, shape (n_samples,)
+            Target values.
+
+        sample_weight : {None, array-like}, shape (n_samples,), optional \
+                (default=None)
+            Sample weights.
+
+        Returns
+        -------
+        phi : float
+            Dispersion parameter.
+        """
+        check_is_fitted(self, "coef_")
+        _dtype = [np.float64, np.float32]
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+                         dtype=_dtype, y_numeric=True, multi_output=False)
+        n_samples, n_features = X.shape
+        weights = _check_weights(sample_weight, n_samples)
+        eta = X @ self.coef_
+        if self.fit_intercept is True:
+            eta += self.intercept_
+            n_features += 1
+        if n_samples <= n_features:
+            raise ValueError("Estimation of dispersion parameter phi requires"
+                             " more samples than features, got"
+                             " samples=X.shape[0]={0} and"
+                             " n_features=X.shape[1]+fit_intercept={1}."
+ .format(n_samples, n_features)) + mu = self._link_instance.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weights*(y-mu)**2 / + self._family_instance.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self._family_instance.deviance(y, mu, weights) + return dev/(n_samples - n_features) + + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. + def score(self, X, y, sample_weight=None): + """Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. + + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. + + y : array-like, shape (n_samples,) + True values of target. + + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) w.r.t. y. + """ + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + weights = _check_weights(sample_weight, y.shape[0]) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. 
- dev / dev_null diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..7cb3f4a5b5969 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,640 @@ +import numpy as np +from numpy.testing import assert_allclose +import pytest +import scipy as sp +from scipy import linalg, optimize, sparse + +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model.glm import ( + Link, + IdentityLink, + LogLink, + LogitLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + GeneralizedHyperbolicSecant, BinomialDistribution, + GeneralizedLinearRegressor) +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge + +from sklearn.utils.testing import ( + assert_equal, assert_almost_equal, + assert_array_equal, assert_array_almost_equal) + + +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100)*100 + link = link() # instatiate object + decimal = 10 + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + decimal = 8 + assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_almost_equal(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x), decimal=decimal) + # for LogitLink, in the following x should be between 0 and 1. + # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_almost_equal(family.deviance(x, x), 0, decimal=10) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())]) +def test_fisher_matrix(family, link): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + rng = np.random.RandomState(42) + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) + + +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight.""" + # 1. scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 2. 2d array + weights = [[0]] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 3. 1d but wrong length + weights = [1, 0] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 4. 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 5. 1d but weith a negative value + weights = [2, -1] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): + """Test GLM family argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) + + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): + """Test GLM link argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) + + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) +def test_glm_P1_argument(P1): + """Test GLM for invalid P1 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = 
GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) + + +@pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], + sparse.csr_matrix([1, 2, 3]), [-1]]) +def test_glm_P2_argument(P2): + """Test GLM for invalid P2 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + rng = np.random.RandomState(42) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for invalid tol argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError): + 
glm.fit(X, y) + + +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(copy_X=copy_X) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) +def test_glm_check_input_argument(check_input): + """Test GLM for invalid check_input argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(check_input=check_input) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_identiy_regression(solver): + """Test GLM regression with identity link on a simple dataset.""" + coef = [1, 2] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.dot(X, coef) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False, solver=solver, + start_params='zero', tol=1e-7) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 6), + ('newton-cg', 1e-7, 6), + ('cd', 1e-7, 6)]) +def test_glm_log_regression(family, solver, tol, dec): + """Test GLM regression with log link on a simple dataset.""" + coef = [0.2, -0.1] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.exp(np.dot(X, coef)) + glm = GeneralizedLinearRegressor( + alpha=0, family=family, link='log', fit_intercept=False, + solver=solver, start_params='guess', tol=tol) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef, decimal=dec) + + +@pytest.mark.filterwarnings('ignore::DeprecationWarning') +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 5), + ('lbfgs', 1e-6, 5), + ('newton-cg', 1e-5, 5), + ('cd', 1e-6, 6)]) +def test_normal_ridge(solver, tol, dec): + """Test ridge regression for Normal distributions. + + Compare to test_ridge in test_ridge.py. + """ + rng = np.random.RandomState(42) + alpha = 1.0 + + # 1. 
With more samples than features + n_samples, n_features, n_predict = 100, 7, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, + solver='svd', normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='svd', normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng, + fit_dispersion='chisqr') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + mu = glm.predict(X) + assert_almost_equal(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) + + # 2. With more features than samples and sparse + n_samples, n_features, n_predict = 10, 100, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, + solver='sag', normalize=False, max_iter=100000) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=300, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, + solver='sag', normalize=False, max_iter=1000) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol*2, max_iter=300, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) + + +@pytest.mark.parametrize('solver, tol, dec', + [('irls', 1e-7, 6), + ('lbfgs', 1e-7, 5), + ('newton-cg', 1e-7, 5), + ('cd', 1e-7, 7)]) +def test_poisson_ridge(solver, tol, dec): + """Test ridge regression with poisson family and LogLink. 
+ + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=tol, + solver=solver, max_iter=300, + random_state=rng) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=dec) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=dec) + + +@pytest.mark.parametrize('diag_fisher', [False, True]) +def test_normal_enet(diag_fisher): + """Test elastic net regression with normal/gaussian family.""" + alpha, l1_ratio = 0.3, 0.7 + n_samples, n_features = 20, 2 + rng = np.random.RandomState(42) + X = rng.randn(n_samples, n_features).copy(order='F') + beta = rng.randn(n_features) + y = 2 + np.dot(X, beta) + rng.randn(n_samples) + + # 1. test normal enet on dense data + glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, + family='normal', link='identity', + fit_intercept=True, tol=1e-8, + max_iter=100, selection='cyclic', + solver='cd', start_params='zero', + check_input=False, + diag_fisher=diag_fisher) + glm.fit(X, y) + + enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, + normalize=False, tol=1e-8, copy_X=True) + enet.fit(X, y) + + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + + # 2. test normal enet on sparse data + X = sparse.csc_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + + +def test_poisson_enet(): + """Test elastic net regression with poisson family and LogLink. + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] 
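# ---------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of the patch: the
# Poisson unit deviance that these glmnet comparisons rely on, written
# out directly; scipy's xlogy handles the y=0 case (0*log(0) -> 0).
import numpy as np
from scipy.special import xlogy


def poisson_deviance(y, mu, weights=1.0):
    # weighted sum of unit deviances 2*(y*log(y/mu) - y + mu)
    return np.sum(weights * 2 * (xlogy(y, y / mu) - y + mu))


y = np.array([0., 1., 1., 2.])              # same toy target as in the test below
mu = np.full_like(y, y.mean())              # intercept-only prediction
null_deviance = poisson_deviance(y, mu)
# ---------------------------------------------------------------------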
+ X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-8, + selection='random', random_state=rng, + start_params='guess') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:] + coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_almost_equal(glm.intercept_, res.x[0], decimal=5) + assert_almost_equal(glm.coef_, res.x[1:], decimal=5) + assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, decimal=8) + + # same for start_params='zero' and selection='cyclic' + # with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + # check warm_start, therefore start with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', max_iter=300, + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. 
+ """ + l1_ratio = 0.5 + n_samples = 500 + rng = np.random.RandomState(42) + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=rng) + log = LogisticRegression( + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) + assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) From 9b574bdc0b3a4a1081f73563d0468c864fcbbb22 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 11 Jun 2019 07:30:52 -0500 Subject: [PATCH 058/209] Fix docstrings for the new print_changed_only=True by default --- doc/modules/linear_model.rst | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 52f28346cc047..b9f0e96734ae5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -942,17 +942,11 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE - GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, - check_input=True, copy_X=True, diag_fisher=False, - family='poisson', fit_dispersion=None, - fit_intercept=True, l1_ratio=0, link='log', - max_iter=100, random_state=None, selection='cyclic', - solver='auto', start_params='guess', tol=0.0001, - verbose=0, warm_start=False) - >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.coef_ array([0.24630169, 0.43373464]) - >>> reg.intercept_ #doctest: +ELLIPSIS + >>> reg.intercept_ -0.76383633... From 90299fdd8a4745965eb8f9dbe26ddb685f1531f1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 12 Jun 2019 04:31:50 -0500 Subject: [PATCH 059/209] Increase coverage --- sklearn/linear_model/_glm.py | 22 +++----- sklearn/linear_model/tests/test_glm.py | 78 ++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 8152e84ac7253..055aacf26d747 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -182,7 +182,7 @@ def link(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - raise NotImplementedError + pass @abstractmethod def derivative(self, mu): @@ -193,7 +193,7 @@ def derivative(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - raise NotImplementedError + pass @abstractmethod def inverse(self, lin_pred): @@ -207,7 +207,7 @@ def inverse(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - raise NotImplementedError + pass @abstractmethod def inverse_derivative(self, lin_pred): @@ -218,7 +218,7 @@ def inverse_derivative(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. 
""" - raise NotImplementedError + pass @abstractmethod def inverse_derivative2(self, lin_pred): @@ -229,7 +229,7 @@ def inverse_derivative2(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - raise NotImplementedError + pass class IdentityLink(Link): @@ -400,7 +400,7 @@ def unit_variance(self, mu): mu : array, shape (n_samples,) Predicted mean. """ - raise NotImplementedError() + pass @abstractmethod def unit_variance_derivative(self, mu): @@ -413,7 +413,7 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Target values. """ - raise NotImplementedError() + pass def variance(self, mu, phi=1, weights=1): r"""Compute the variance function. @@ -473,7 +473,7 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - raise NotImplementedError() + pass def unit_deviance_derivative(self, y, mu): r"""Compute the derivative of the unit deviance w.r.t. mu. @@ -1079,7 +1079,6 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, B[idx:, idx:] += P2.toarray() else: B[idx:, idx:] += P2 - # A = -score + coef_P2 A = -score A[idx:] += coef_P2 # A += d @ (H+P2) but so far d=0 @@ -1302,9 +1301,6 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], order='F', copy=copy_X) if sparse.issparse(X): - if not sparse.isspmatrix_csc(X): - raise ValueError("If X is sparse, it must be in csc format" - "; got (format={})".format(X.format)) if not sparse.isspmatrix_csc(P2): raise ValueError("If X is sparse, P2 must also be sparse csc" "format. Got P2 not sparse.") @@ -2021,7 +2017,7 @@ def fit(self, X, y, sample_weight=None): _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, max_inner_iter=1000, selection=self.selection, - random_state=self.random_state, + random_state=random_state, diag_fisher=self.diag_fisher) coef += d # for simplicity no line search here else: diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index d42a8739f6aa0..230bbdabae201 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -21,10 +21,20 @@ GeneralizedHyperbolicSecant, BinomialDistribution, ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge +from sklearn.metrics import mean_absolute_error from sklearn.utils.testing import assert_array_equal +@pytest.fixture(scope="module") +def regression_data(): + X, y = make_regression(n_samples=107, + n_features=10, + n_informative=80, noise=0.5, + random_state=2) + return X, y + + @pytest.mark.parametrize('link', Link.__subclasses__()) def test_link_properties(link): """Test link inverse and derivative.""" @@ -39,6 +49,10 @@ def test_link_properties(link): # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) assert_allclose(link.derivative(link.inverse(x)), 1./link.inverse_derivative(x)) + + assert ( + link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) + # for LogitLink, in the following x should be between 0 and 1. # assert_almost_equal(link.inverse_derivative(link.link(x)), # 1./link.derivative(x), decimal=decimal) @@ -108,7 +122,7 @@ def f(coef): def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" - # 1. scalar value but not positive + # scalar value but not positive X = [[1]] y = [1] weights = 0 @@ -116,17 +130,20 @@ def test_sample_weights_validation(): with pytest.raises(ValueError): glm.fit(X, y, weights) - # 2. 
2d array + # Positive weights are accepted + glm.fit(X, y, sample_weight=1) + + # 2d array weights = [[0]] with pytest.raises(ValueError): glm.fit(X, y, weights) - # 3. 1d but wrong length + # 1d but wrong length weights = [1, 0] with pytest.raises(ValueError): glm.fit(X, y, weights) - # 4. 1d but only zeros (sum not greater than 0) + # 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] @@ -643,3 +660,56 @@ def test_binomial_enet(alpha): glm.fit(X, y) assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) assert_allclose(log.coef_[0, :], glm.coef_, rtol=5e-6) + + +@pytest.mark.parametrize( + "params", + [ + {"solver": "irls", "start_params": "guess"}, + {"solver": "irls", "start_params": "zero"}, + {"solver": "lbfgs", "start_params": "guess"}, + {"solver": "lbfgs", "start_params": "zero"}, + {"solver": "newton-cg"}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": False}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": True}, + {"solver": "cd", "selection": "random", "diag_fisher": False}, + ], + ids=lambda params: ', '.join("%s=%s" % (key, val) + for key, val in params.items()) +) +def test_solver_equivalence(params, regression_data): + X, y = regression_data + est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref.fit(X, y) + + estimator = GeneralizedLinearRegressor(**params) + estimator.set_params(random_state=2) + + estimator.fit(X, y) + + assert_allclose(estimator.intercept_, est_ref.intercept_, rtol=1e-4) + assert_allclose(estimator.coef_, est_ref.coef_, rtol=1e-4) + assert_allclose( + mean_absolute_error(estimator.predict(X), y), + mean_absolute_error(est_ref.predict(X), y), + rtol=1e-4 + ) + + +def test_fit_dispersion(regression_data): + X, y = regression_data + + est1 = GeneralizedLinearRegressor(random_state=2) + est1.fit(X, y) + assert not hasattr(est1, "dispersion_") + + est2 = GeneralizedLinearRegressor(random_state=2, fit_dispersion="chisqr") + est2.fit(X, y) + assert isinstance(est2.dispersion_, float) + + est3 = GeneralizedLinearRegressor( + random_state=2, fit_dispersion="deviance") + est3.fit(X, y) + assert isinstance(est3.dispersion_, float) + + assert_allclose(est2.dispersion_, est3.dispersion_) From e3a5a9aa6855928d89bdc5dad7f3a46ba934fb8e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 12 Jun 2019 12:00:28 -0500 Subject: [PATCH 060/209] More tests and addressing some review comments --- sklearn/linear_model/_glm.py | 51 +++++++++++++------------- sklearn/linear_model/tests/test_glm.py | 43 +++++++++++++++++++++- 2 files changed, 67 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 055aacf26d747..acce438b2b5e2 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,8 +6,6 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Should the option `normalize` be included (like other linear models)? -# So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support, e.g. GCV? # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. @@ -287,7 +285,6 @@ def inverse_derivative(self, lin_pred): return ep * (1. - ep) def inverse_derivative2(self, lin_pred): - ep = special.expit(lin_pred) ep = special.expit(lin_pred) return ep * (1. - ep) * (1. 
- 2 * ep) @@ -738,7 +735,19 @@ class TweedieDistribution(ExponentialDispersionModel): For ``0 0) - 1, pgtol=self.tol, maxiter=self.max_iter, factr=1e3) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) self.n_iter_ = info['nit'] # 4.3 Newton-CG ####################################################### diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 230bbdabae201..7229f21840829 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -22,6 +22,7 @@ ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.metrics import mean_absolute_error +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.testing import assert_array_equal @@ -72,6 +73,23 @@ def test_family_bounds(family, expected): assert_array_equal(result, expected) +def test_tweedie_distribution_power(): + with pytest.raises(ValueError, match="no distribution exists"): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert dist._include_lower_bound is False + dist.power = 1 + assert dist._include_lower_bound is True + + @pytest.mark.parametrize( 'family, chk_values', [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), @@ -97,7 +115,8 @@ def test_deviance_zero(family, chk_values): (GammaDistribution(), LogLink()), (InverseGaussianDistribution(), LogLink()), (TweedieDistribution(power=1.5), LogLink()), - (TweedieDistribution(power=4.5), LogLink())]) + (TweedieDistribution(power=4.5), LogLink())], + ids=lambda args: args.__class__.__name__) def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. 
Trick: Use numerical differentiation with y = mu""" @@ -110,6 +129,11 @@ def test_fisher_matrix(family, link): weights = rng.randn(10)**2 + 1 fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, weights=weights, link=link) + # check that the Fisher matrix is square and positive definite + assert fisher.ndim == 2 + assert fisher.shape[0] == fisher.shape[1] + assert np.all(np.linalg.eigvals(fisher) >= 0) + approx = np.array([]).reshape(0, coef.shape[0]) for i in range(coef.shape[0]): def f(coef): @@ -119,6 +143,13 @@ def f(coef): [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) assert_allclose(fisher, approx, rtol=1e-3) + # check the observed information matrix + oim = family._observed_information(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + assert oim.ndim == 2 + assert oim.shape == fisher.shape + assert_allclose(oim, fisher) + def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" @@ -713,3 +744,13 @@ def test_fit_dispersion(regression_data): assert isinstance(est3.dispersion_, float) assert_allclose(est2.dispersion_, est3.dispersion_) + + +@pytest.mark.parametrize("solver", ["irls", "lbfgs", "newton-cg", "cd"]) +def test_convergence_warning(solver, regression_data): + X, y = regression_data + + est = GeneralizedLinearRegressor(solver=solver, random_state=2, + max_iter=1, tol=1e-20) + with pytest.warns(ConvergenceWarning): + est.fit(X, y) From 54b80b8db780fbd3b438c6d5dc0001cdafe7a47e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 13 Jun 2019 09:50:53 -0500 Subject: [PATCH 061/209] TST More specific checks of error messages in tests --- sklearn/linear_model/_glm.py | 6 ++-- sklearn/linear_model/tests/test_glm.py | 44 ++++++++++++++------------ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index acce438b2b5e2..c4f8cf7a975d3 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -1941,8 +1941,10 @@ def fit(self, X, y, sample_weight=None): k = np.min([10, n_features // 10 + 1]) sigma = 0 # start searching near this value which = 'SA' # find smallest algebraic eigenvalues first - if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, - which=which) >= epsneg): + eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, + which=which, + return_eigenvectors=False) + if not np.all(eigenvalues >= epsneg): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 7229f21840829..de1a5262b36ce 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -158,7 +158,7 @@ def test_sample_weights_validation(): y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="weights must be non-negative"): glm.fit(X, y, weights) # Positive weights are accepted @@ -166,24 +166,26 @@ def test_sample_weights_validation(): # 2d array weights = [[0]] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="must be 1D array or scalar"): glm.fit(X, y, weights) # 1d but wrong length weights = [1, 0] - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="weights must have the same length as y"): glm.fit(X, y, weights) # 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] - with 
pytest.raises(ValueError): + with pytest.raises(ValueError, + match="must have at least one positive element"): glm.fit(X, y, weights) # 5. 1d but with a negative value weights = [2, -1] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="weights must be non-negative"): glm.fit(X, y, weights) @@ -202,7 +204,7 @@ def test_glm_family_argument(f, fam): glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="family must be"): glm.fit(X, y) @@ -218,7 +220,7 @@ def test_glm_link_argument(l, link): assert isinstance(glm._link_instance, link.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="link must be"): glm.fit(X, y) @@ -228,7 +230,8 @@ def test_glm_alpha_argument(alpha): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="Penalty term must be a non-negative"): glm.fit(X, y) @@ -238,7 +241,8 @@ def test_glm_l1_ratio_argument(l1_ratio): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="l1_ratio must be a number in interval.*0, 1"): glm.fit(X, y) @@ -276,13 +280,13 @@ def test_glm_P2_positive_semidefinite(): P2 = Q.T @ P2 @ Q glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, check_input=True) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): glm.fit(X, y) P2 = sparse.csr_matrix(P2) glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, check_input=True) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): glm.fit(X, y) @@ -292,7 +296,7 @@ def test_glm_fit_intercept_argument(fit_intercept): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="fit_intercept must be bool"): glm.fit(X, y) @@ -314,7 +318,7 @@ def test_glm_max_iter_argument(max_iter): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(max_iter=max_iter) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="must be a positive integer"): glm.fit(X, y) @@ -324,7 +328,7 @@ def test_glm_tol_argument(tol): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(tol=tol) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="stopping criteria must be positive"): glm.fit(X, y) @@ -334,7 +338,7 @@ def test_glm_warm_start_argument(warm_start): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(warm_start=warm_start) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="warm_start must be bool"): glm.fit(X, y) @@ -356,7 +360,7 @@ def test_glm_selection_argument(selection): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(selection=selection) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="argument selection must be"): glm.fit(X, y) @@ -366,7 +370,7 @@ def test_glm_random_state_argument(random_state): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(random_state=random_state) - with 
pytest.raises(ValueError): + with pytest.raises(ValueError, match="cannot be used to seed"): glm.fit(X, y) @@ -376,7 +380,7 @@ def test_glm_diag_fisher_argument(diag_fisher): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="diag_fisher must be bool"): glm.fit(X, y) @@ -386,7 +390,7 @@ def test_glm_copy_X_argument(copy_X): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(copy_X=copy_X) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="copy_X must be bool"): glm.fit(X, y) @@ -396,7 +400,7 @@ def test_glm_check_input_argument(check_input): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(check_input=check_input) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="check_input must be bool"): glm.fit(X, y) From 7db0320f460676d8bed8e01bf64657a89532e2cb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 14 Jun 2019 10:44:16 -0500 Subject: [PATCH 062/209] Add PoissonRegressor alias --- sklearn/linear_model/__init__.py | 5 +- sklearn/linear_model/_glm.py | 163 +++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index e5ede64413eb5..121418f901a1a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -19,7 +19,7 @@ MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) from ._glm import (TweedieDistribution, - GeneralizedLinearRegressor) + GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -82,4 +82,5 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'TweedieDistribution'] + 'TweedieDistribution', + 'PoissonRegressor'] diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index c4f8cf7a975d3..b18731e73f328 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -2326,3 +2326,166 @@ def score(self, X, y, sample_weight=None): y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) return 1. - dev / dev_null + + def _more_tags(self): + return {"requires_positive_y": True} + + +class PoissonRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Poisson distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). + The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms and thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. 
bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statistic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'irls', 'lbfgs', 'newton-cg'}, optional (default='irls') + Algorithm to use in the optimization problem: + + 'irls' + Iterated reweighted least squares. It is the standard algorithm + for GLMs. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cg' + Newton conjugate gradient algorithm. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step. This gives usually good + starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + random_state : {int, RandomState instance, None}, optional (default=None) + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. 
+ + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. + + If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poisson distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. + The weights are necessary for the right (finite sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + + References + ---------- + For the coordinate descent implementation: + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, + solver='irls', max_iter=100, + tol=1e-4, warm_start=False, start_params='guess', + random_state=None, copy_X=True, check_input=True, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="poisson", link='log', + fit_dispersion=fit_dispersion, solver=solver, + max_iter=max_iter, tol=tol, warm_start=warm_start, + start_params=start_params, random_state=random_state, + copy_X=copy_X, verbose=verbose) From dcfe9edaf83509d2b6d98ee840c28f674ea4f496 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 27 Jun 2019 08:41:04 -0500 Subject: [PATCH 063/209] TST Simplify comparison with ridge --- sklearn/linear_model/tests/test_glm.py | 94 ++++++-------------------- 1 file changed, 22 insertions(+), 72 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de1a5262b36ce..1416bdcfad680 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -26,6 +26,8 @@ from sklearn.utils.testing import assert_array_equal +GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg', 'cd'] + @pytest.fixture(scope="module") def regression_data(): @@ -404,7 +406,7 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +@pytest.mark.parametrize('solver', GLM_SOLVERS) def test_glm_identity_regression(solver): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] 
@@ -442,97 +444,45 @@ def test_glm_log_regression(family, solver, tol): # newton-cg may issue a LineSearchWarning, which we filter out @pytest.mark.filterwarnings('ignore:The line search algorithm') @pytest.mark.filterwarnings('ignore:Line Search failed') -@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), - ('lbfgs', 1e-6), - ('newton-cg', 1e-6), - ('cd', 1e-6)]) -def test_normal_ridge(solver, tol): +@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize('solver', GLM_SOLVERS) +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): """Test ridge regression for Normal distributions. + Case n_samples >> n_features + Compare to test_ridge in test_ridge.py. """ - rng = np.random.RandomState(42) alpha = 1.0 - - # 1. With more samples than features - n_samples, n_features, n_predict = 100, 7, 10 + n_predict = 10 X, y, coef = make_regression(n_samples=n_samples+n_predict, n_features=n_features, n_informative=n_features-2, noise=0.5, - coef=True, random_state=rng) + coef=True, random_state=42) y = y[0:n_samples] X, T = X[0:n_samples], X[n_samples:] - # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, - solver='svd', normalize=False) - ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=True, - tol=tol, max_iter=100, solver=solver, - check_input=False, random_state=rng) - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) - - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, - solver='svd', normalize=False) - ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=False, - tol=tol, max_iter=100, solver=solver, - check_input=False, random_state=rng, - fit_dispersion='chisqr') - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-5) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) - mu = glm.predict(X) - assert_allclose(glm.dispersion_, - np.sum((y-mu)**2/(n_samples-n_features))) - - # 2. 
With more features than samples and sparse - n_samples, n_features, n_predict = 10, 100, 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=rng) - y = y[0:n_samples] - X, T = X[0:n_samples], X[n_samples:] + if n_samples > n_features: + ridge_params = {"solver": "svd"} + else: + ridge_params = {"solver": "sag", "max_iter": 10000, "tol": 1e-9} # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, - solver='sag', normalize=False, max_iter=100000, - random_state=42) + ridge = Ridge(alpha=alpha*n_samples, normalize=False, + random_state=42, **ridge_params) ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, - tol=tol, max_iter=300, solver=solver, - check_input=False, random_state=rng) + max_iter=300, solver=solver, tol=1e-6, + check_input=False, random_state=42) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, - solver='sag', normalize=False, max_iter=1000, - random_state=42) - ridge.fit(X, y) - - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=False, - tol=tol*2, max_iter=300, solver=solver, - check_input=False, random_state=rng) - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-4) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) - @pytest.mark.parametrize('solver, tol', [('irls', 1e-7), @@ -559,7 +509,7 @@ def test_poisson_ridge(solver, tol): rng = np.random.RandomState(42) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', - link='log', tol=tol, + link='log', tol=1e-7, solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) @@ -750,7 +700,7 @@ def test_fit_dispersion(regression_data): assert_allclose(est2.dispersion_, est3.dispersion_) -@pytest.mark.parametrize("solver", ["irls", "lbfgs", "newton-cg", "cd"]) +@pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From 4879bb6e057a95c38ee4950d13ef2dbd98da0a19 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 28 Jun 2019 09:54:59 -0500 Subject: [PATCH 064/209] EXA Add plot_tweedie_regression_insurance_claims.py --- ...lot_tweedie_regression_insurance_claims.py | 500 ++++++++++++++++++ 1 file changed, 500 insertions(+) create mode 100644 examples/linear_model/plot_tweedie_regression_insurance_claims.py diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py new file mode 100644 index 0000000000000..cb2ff667e8379 --- /dev/null +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -0,0 +1,500 @@ +""" +====================================== +Tweedie regression on insurance claims +====================================== + +This example illustrate the use Poisson, Gamma and Tweedie regression +on the French Motor Third-Party Liability Claims dataset, and is inspired +by an R tutorial [1]. 
+ +Insurance claims data consist of the number of claims and the total claim +amount. Often, the final goal is to predict the expected value, i.e. the mean, +of the total claim amount. There are several possibilities to do that, two of +which are: + +1. Model the number of claims with a Poisson distribution, the average + claim amount as a Gamma distribution and multiply the predictions, to get + the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution. + +In this example we will illustrate both approaches. We start by defining a few +helper functions for loading the data and visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +from functools import partial + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import TweedieDistribution +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer + +from sklearn.metrics import mean_absolute_error + + +def load_mtpl2(n_samples=100000): + """Fetcher for French Motor Third-Party Liability Claims dataset + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, + title=None, kind_weight=None, ax=None): + """Plot observed and predicted - aggregated per feature level. 
+ + Parameters + ---------- + df : DataFrame with at least one column named feature + observed : str + a column name of the observed target + predicted : frame + a dataframe, with the same index as df, with the predicted target + weight : str + column name with the values of weights/exposure + """ + # aggregate observed and predicted variables by feature level + df_ = df.loc[:, [feature, weight]].copy() + df_["observed"] = df[observed] * df[weight] + df_["predicted"] = y_predicted * df[weight] + df_ = ( + df_.groupby([feature])[weight, "observed", "predicted"] + .sum() + .assign(observed=lambda x: x["observed"] / x[weight]) + .assign(predicted=lambda x: x["predicted"] / x[weight]) + ) + + ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax) + y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8 + ax.fill_between( + df_.index, + 0, + y_max * df_[weight] / df_[weight].values.max(), + color="g", + alpha=0.1, + ) + ax.set( + ylabel=y_label if y_label is not None else None, + title=title if title is not None else "Train: Observed vs Predicted", + ) + + +############################################################################## +# +# 1. Loading datasets and pre-processing +# -------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``) with the freMTPL2sev table +# containing the claim amount (``ClaimAmount``) for the same user ids. + +df = load_mtpl2(n_samples=100000) + +# Note: filter out claims with zero amount, as the severity model +# requires a strictly positive target values. +df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + +# correct for unreasonable observations (that might be data error) +df["ClaimNb"].clip(upper=4, inplace=True) +df["Exposure"].clip(upper=1, inplace=True) + +column_trans = ColumnTransformer( + [ + ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "Veh_Brand_Gas_Region", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("BonusMalus", "passthrough", ["BonusMalus"]), + ( + "Density_log", + make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler() + ), + ["Density"], + ), + ], + remainder="drop", +) +X = column_trans.fit_transform(df) + + +df["Frequency"] = df.ClaimNb / df.Exposure +df["AvgClaimAmount"] = df.ClaimAmount / np.fmax(df.ClaimNb, 1) + +print(df[df.ClaimAmount > 0].head()) + +############################################################################## +# +# 2. Frequency model -- Poisson distribution +# ------------------------------------------- +# +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occuring with a constant rate in a given time interval (``Exposure``). +# Here we model the frequency ``y = ClaimNb / Exposure``, +# which is still a (scaled) Poisson distribution. +# +# A very important property of the Poisson distribution is its mean-variance +# relation: The variance is proportional to the mean. 
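A quick numerical illustration of this mean-variance relation, and of why ``Exposure`` enters as a sample weight for the scaled Poisson frequency, can help; the following is an illustrative sketch only (not part of the patch), assuming nothing beyond numpy::

    import numpy as np

    rng = np.random.RandomState(0)
    exposure = rng.uniform(0.1, 1.0, size=100000)   # plays the role of df["Exposure"]
    rate = 0.1                                      # true claim frequency per unit of exposure
    claim_nb = rng.poisson(rate * exposure)         # counts ~ Poisson(rate * exposure)
    freq = claim_nb / exposure                      # the modeled target y = ClaimNb / Exposure

    # Poisson counts have Var[ClaimNb_i] = E[ClaimNb_i] = rate * exposure_i, hence the
    # frequency has Var[freq_i] = rate / exposure_i: rows with larger exposure are less
    # noisy, which is exactly what sample_weight=Exposure encodes in the fit below.
    print(claim_nb.mean(), claim_nb.var())          # both close to rate * mean(exposure)
    print(freq.mean())                              # close to the true rate 0.1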
+ +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) + +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + + +def mean_deviance(estimator, y, y_pred, weights): + if hasattr(estimator, "_family_instance"): + return estimator._family_instance.deviance(y, y_pred, weights) / len(y) + else: + return np.nan + + +def score_estimator( + estimator, X_train, X_test, df_train, df_test, target, weights +): + res = [] + + for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), + ]: + y, _weights = df[target], df[weights] + + for score_label, metric in [ + ("D² explaned", None), + ("mean deviance", partial(mean_deviance, estimator)), + ("mean abs. error", mean_absolute_error), + ]: + if estimator.__class__.__name__ == "ClaimProdEstimator": + # ClaimProdEstimator is the product of the frequency and + # severity models, together with a denormalized by the exposure + # values. It does not fully follow the scikit-learn API and we + # must handle it separately. + y_pred = estimator.predict(X, exposure=df.Exposure.values) + else: + y_pred = estimator.predict(X) + if metric is None: + if not hasattr(estimator, "score"): + continue + score = estimator.score(X, y, _weights) + else: + score = metric(y, y_pred, _weights) + + res.append( + {"subset": subset_label, "metric": score_label, "score": score} + ) + + res = ( + pd.DataFrame(res) + .set_index(["metric", "subset"]) + .score.unstack(-1) + .round(3) + ) + return res + + +scores = score_estimator( + glm_freq, + X_train, + X_test, + df_train, + df_test, + target="Frequency", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated by +# the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance +# bonus/penalty (``BonusMalus``), + +fig, ax = plt.subplots(2, 2, figsize=(16, 8)) +fig.subplots_adjust(hspace=0.3, wspace=0.2) + +plot_obs_pred( + df_train, + "DrivAge", + "Frequency", + glm_freq.predict(X_train), + weight="Exposure", + y_label="Claim Frequency", + title="train data", + ax=ax[0, 0], +) + +plot_obs_pred( + df_test, + "DrivAge", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[0, 1], +) + +plot_obs_pred( + df_test, + "VehAge", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[1, 0], +) + +plot_obs_pred( + df_test, + "BonusMalus", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[1, 1], +) + + +############################################################################## +# +# 3. Severity model - Gamma Distribution +# --------------------------------------- +# The mean claim amount or severity (`AvgClaimAmount`) can be empirically +# shown to follow a Gamma distribution. We fit a GLM model for the severity +# with the same features as the frequency model. +# +# Note: +# - We filter out ``ClaimAmount == 0``` as the Gamma distribution as support +# on :math:`(0, \infty)` not :math:`[0, \infty)`. +# - We use ``ClaimNb`` as sample weights. 
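The effect of using ``ClaimNb`` as sample weights can be checked directly: weighting ``AvgClaimAmount`` by the same (clipped) claim counts used in its definition reduces the weighted mean to the total claim amount divided by the total weight, i.e. a per-claim rather than per-policy average. A small check, assuming the ``df`` prepared above (illustrative only)::

    import numpy as np

    mask = df["ClaimAmount"] > 0
    w = np.fmax(df.loc[mask, "ClaimNb"], 1)   # same weights used to define AvgClaimAmount
    print(np.average(df.loc[mask, "AvgClaimAmount"], weights=w))
    print(df.loc[mask, "ClaimAmount"].sum() / w.sum())   # identical by construction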
+ +mask_train = df_train["ClaimAmount"] > 0 +mask_test = df_test["ClaimAmount"] > 0 + +glm_sev = GeneralizedLinearRegressor(family="gamma", alpha=1) + +glm_sev.fit( + X_train[mask_train.values], + df_train.loc[mask_train, "AvgClaimAmount"], + sample_weight=df_train.loc[mask_train, "ClaimNb"], +) + + +scores = score_estimator( + glm_sev, + X_train[mask_train.values], + X_test[mask_test.values], + df_train[mask_train], + df_test[mask_test], + target="AvgClaimAmount", + weights="ClaimNb", +) +print(scores) + +############################################################################## +# +# Note that the resulting model is conditional on having at least one claim, +# and cannot be used to predict the average claim amount in general, + +print( + "Mean AvgClaim Amount: %.2f " + % df_train.AvgClaimAmount.mean() +) +print( + "Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() +) +print( + "Predicted Mean AvgClaim Amount: %.2f" + % glm_sev.predict(X_train).mean() +) + + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated for +# the drivers age (``Driv Age``), + +fig, ax = plt.subplots(1, 2, figsize=(16, 4)) + +# plot DivAge +plot_obs_pred( + df_train.loc[mask_train], + "DrivAge", + "AvgClaimAmount", + glm_sev.predict(X_train[mask_train.values]), + weight="Exposure", + y_label="Average Claim Severity", + title="train data", + ax=ax[0], +) + +plot_obs_pred( + df_test.loc[mask_test], + "DrivAge", + "AvgClaimAmount", + glm_sev.predict(X_test[mask_test.values]), + weight="Exposure", + y_label="Average Claim Severity", + title="test data", + ax=ax[1], +) + + +############################################################################## +# +# 3. Total Claims Amount -- Compound Poisson distribution +# ------------------------------------------------------- +# +# As mentionned in the introduction, the total claim amount can be modeled +# either as the product of the frequency model by the severity model. + + +class ClaimProdEstimator: + """Total claim amount estimator + + Computed as the product of the frequency model by the serverity model, + denormalized by exposure. + """ + + def __init__(self, est_freq, est_sev): + self.est_freq = est_freq + self.est_sev = est_sev + + def predict(self, X, exposure): + """Predict the total claim amount + + The predict method is not compatible with the scikit-learn API. + """ + return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) + + +est_prod = ClaimProdEstimator(glm_freq, glm_sev) + +scores = score_estimator( + est_prod, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + + +############################################################################## +# +# or as a unique Compound Poisson model, also corresponding to a Tweedie model +# with a power :math:`p \in (1, 2)`. 
We determine the optimal hyperparameter +# ``p`` with a grid search, + +from sklearn.model_selection import GridSearchCV + +# this takes a while +params = { + "family": [ + TweedieDistribution(power=power) for power in np.linspace(1, 2, 8) + ] +} + +glm_total = GridSearchCV( + GeneralizedLinearRegressor(), cv=3, param_grid=params, n_jobs=-1 +) +glm_total.fit( + X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] +) + + +print( + "Best hyperparameters: power=%.2f\n" + % glm_total.best_estimator_.family.power +) + +scores = score_estimator( + glm_total.best_estimator_, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# In this example, the mean absolute error is lower for the Compound Poisson +# model than when using separate models for frequency and severity. +# +# We can additionally validate these models by comparing observed and predicted +# total claim amount over the test and train subsets. + +res = [] +for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), +]: + res.append( + { + "subset": subset_label, + "observed": df.ClaimAmount.values.sum(), + "predicted, frequency*severity model": np.sum( + est_prod.predict(X, exposure=df.Exposure.values) + ), + "predicted, tweedie, p=%.2f" + % glm_total.best_estimator_.family.power: np.sum( + glm_total.best_estimator_.predict(X) + ), + } + ) + +print(pd.DataFrame(res).set_index("subset").T) From 56069e5b3f5b453d9e8a487c9d27e20900ce4d63 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 28 Jun 2019 10:11:37 -0500 Subject: [PATCH 065/209] EXA Fix issues with older pandas versions in example --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index cb2ff667e8379..063d12e6e291b 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -137,8 +137,8 @@ def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # correct for unreasonable observations (that might be data error) -df["ClaimNb"].clip(upper=4, inplace=True) -df["Exposure"].clip(upper=1, inplace=True) +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) column_trans = ColumnTransformer( [ From 53f3c5f6670bbdefcd5ddf6d63e405401909cf06 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 09:14:30 -0500 Subject: [PATCH 066/209] DOC Add second poisson regression example --- ...plot_poisson_regression_non_normal_loss.py | 257 ++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 examples/linear_model/plot_poisson_regression_non_normal_loss.py diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py new file mode 100644 index 0000000000000..b06adcb787560 --- /dev/null +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -0,0 +1,257 @@ +""" +====================================== +Poisson regression and non normal loss +====================================== + +This example illustrate the use linear Poisson regression +on the French 
Motor Third-Party Liability Claims dataset [1] and compare +it with learning models with least squared error. + + +We start by defining a few helper functions for loading the data and +visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +from scipy.special import xlogy + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.ensemble import GradientBoostingRegressor + +from sklearn.metrics import mean_squared_error, mean_absolute_error + + +def load_mtpl2(n_samples=100000): + """Fetcher for French Motor Third-Party Liability Claims dataset + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +############################################################################## +# +# 1. Loading datasets and pre-processing +# -------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``) with the freMTPL2sev table +# containing the claim amount (``ClaimAmount``) for the same user ids. + +df = load_mtpl2(n_samples=100000) + +# Note: filter out claims with zero amount, as the severity model +# requires a strictly positive target values. +df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + +# correct for unreasonable observations (that might be data error) +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) + +column_trans = ColumnTransformer( + [ + ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "Veh_Brand_Gas_Region", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("BonusMalus", "passthrough", ["BonusMalus"]), + ( + "Density_log", + make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler() + ), + ["Density"], + ), + ], + remainder="drop", +) +X = column_trans.fit_transform(df) + +############################################################################## +# +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval +# (``Exposure``). 
Here we model the frequency ``y = ClaimNb / Exposure``, +# which is still a (scaled) Poisson distribution. +# +# A very important property of the Poisson distribution is its mean-variance +# relation: The variance is proportional to the mean. + +df["Frequency"] = df.ClaimNb / df.Exposure + +print( + pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() +) + +############################################################################## +# +# It worth noting that 96 % of users have 0 claims, and if we were to convert +# this problem into a binary classification task, it would be significantly +# imbalanced. +# +# To evaluate the pertinence of the used metrics, we will consider as a +# baseline an estimator that returns 0 for any input. + +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) + + +def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): + y_true = np.atleast_1d(y_true) + y_pred = np.atleast_1d(y_pred) + dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) + return np.average(dev, weights=sample_weights) + + +eps = 1e-5 +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, np.zeros(len(df_test)), + df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, np.zeros(len(df_test)), + df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, eps + np.zeros(len(df_test)), + df_test.Exposure.values)) + + +############################################################################## +# +# We start by modeling the target variable with the least squares linear +# regression model, + + +linregr = LinearRegression() +linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + +print("LinearRegression") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, linregr.predict(X_test), + df_test.Exposure.values)) +print("MSE: %.3f" % mean_absolute_error( + df_test.Frequency.values, linregr.predict(X_test), + df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps), + df_test.Exposure.values)) + +############################################################################## +# +# The Poisson deviance cannot be computed because negative values are +# predicted by the model, + +print('Number Negatives: %s / total: %s' % ( + (linregr.predict(X_test) < 0).sum(), X_test.shape[0])) + +############################################################################## +# +# Next we fit the Poisson regressor on the target variable, + +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + +print("PoissonRegressor") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, glm_freq.predict(X_test), + df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, glm_freq.predict(X_test), + df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, glm_freq.predict(X_test), + df_test.Exposure.values)) + +############################################################################## +# +# Finally we will consider a non linear model with Gradient boosting that +# still minimizes the least square error. 
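Before turning to the gradient boosting fit, a quick sanity check of the ``mean_poisson_deviance_score`` helper defined above can be reassuring; this is an illustrative aside (not part of the example script) that exercises its formula ``2*(y*log(y/mu) - y + mu)`` on toy data::

    import numpy as np

    y_true = np.array([0., 1., 2.])
    # near-perfect predictions give (almost) zero deviance; xlogy handles the y=0 term
    print(mean_poisson_deviance_score(y_true, np.maximum(y_true, 1e-12)))
    # predicting the constant mean gives a strictly positive baseline value
    print(mean_poisson_deviance_score(y_true, np.full_like(y_true, y_true.mean())))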
+ + +gbr = GradientBoostingRegressor(max_depth=3) +gbr.fit(X_train, df_train.Frequency.values, + sample_weight=df_train.Exposure.values) + + +print("GradientBoostingRegressor") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) + +############################################################################## +# +# In this example, although Gradient boosting minimizes the least square error, +# because of a higher predictive power it also results in a smaller Poisson +# deviance than the Poisson regression model. +# +# Evaluating models with a single train / test split is prone to numerical +# errors, we can verify that we would also get equivalent resuts with the +# cross-validation score. +# +# The difference between these models can also be visualized by comparing the +# histogram of observed target values with that of predicted values, + + +fig, ax = plt.subplots(1, 4, figsize=(16, 3)) + +df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=ax[0]) + +ax[0].set_title('Experimental data') + +for idx, model in enumerate([linregr, glm_freq, gbr]): + y_pred = model.predict(X_train) + + pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) + ax[idx+1].set_title(model.__class__.__name__) + +for axi in ax: + axi.set( + yscale='log', + xlabel="y (Frequency)" + ) From be5a3c485684ae45835258a71a1870a59549fbda Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 09:55:57 -0500 Subject: [PATCH 067/209] Add GeneralizedHyperbolicSecant and BinomialDistributions --- sklearn/linear_model/_glm.py | 109 ++++--------------------- sklearn/linear_model/tests/test_glm.py | 34 +------- 2 files changed, 18 insertions(+), 125 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index b18731e73f328..736e50960dcda 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,36 +6,6 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Add cross validation support, e.g. GCV? -# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? -# So far, it does not. -# TODO: Include further classes in class.rst? ExponentialDispersionModel? -# TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used -# for group lasso. - -# Design Decisions: -# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. -# Estimators in sklearn are either regressors or classifiers. A GLM can do -# both depending on the distr (Normal => regressor, Binomial => classifier). -# Solution: GeneralizedLinearRegressor since this is the focus. -# - Allow for finer control of penalty terms: -# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude -# factors from the L1 penalty. -# L2: w*P2*w with P2 a positive (semi-) definite matrix, e.g. P2 could be -# a 1st or 2nd order difference matrix (compare B-spline penalties and -# Tikhonov regularization). -# - The link function (instance of class Link) is necessary for the evaluation -# of deviance, score, Fisher and Hessian matrix as functions of the -# coefficients, which is needed by optimizers. 
-# Solution: link as argument in those functions -# - Which name/symbol for sample_weight in docu? -# sklearn.linear_models uses w for coefficients, standard literature on -# GLMs use beta for coefficients and w for (sample) weights. -# So far, coefficients=w and sample weights=s. -# - The intercept term is the first index, i.e. coef[0] - - from __future__ import division from abc import ABCMeta, abstractmethod import numbers @@ -51,6 +21,7 @@ from ..utils.validation import check_is_fitted, check_random_state + def _check_weights(sample_weight, n_samples): """Check that sample weights are non-negative and have the right shape.""" if sample_weight is None: @@ -854,47 +825,12 @@ def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) -class GeneralizedHyperbolicSecant(ExponentialDispersionModel): - """A class for the Generalized Hyperbolic Secant (GHS) distribution. - - The GHS distribution is for targets y in (-inf, inf). - """ - def __init__(self): - self._lower_bound = -np.Inf - self._upper_bound = np.Inf - self._include_lower_bound = False - self._include_upper_bound = False - - def unit_variance(self, mu): - return 1 + mu**2 - - def unit_variance_derivative(self, mu): - return 2 * mu - - def unit_deviance(self, y, mu): - return (2 * y * (np.arctan(y) - np.arctan(mu)) + - np.log((1 + mu**2)/(1 + y**2))) - - -class BinomialDistribution(ExponentialDispersionModel): - """A class for the Binomial distribution. - - The Binomial distribution is for targets y in [0, 1]. - """ - def __init__(self): - self._lower_bound = 0 - self._upper_bound = 1 - self._include_lower_bound = True - self._include_upper_bound = True - - def unit_variance(self, mu): - return mu * (1 - mu) - - def unit_variance_derivative(self, mu): - return 1 - 2 * mu - - def unit_deviance(self, y, mu): - return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) +EDM_DISTRIBUTIONS = { + 'normal': NormalDistribution, + 'poisson': PoissonDistribution, + 'gamma': GammaDistribution, + 'inverse.gaussian': InverseGaussianDistribution, +} def _irls_step(X, W, P2, z, fit_intercept=True): @@ -1690,28 +1626,19 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1. 
input validation # ####################################################################### - # 1.1 validate arguments of __init__ ################################## + # 1.1 validate arguments of __init__ # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family + elif self.family in EDM_DISTRIBUTIONS: + self._family_instance = EDM_DISTRIBUTIONS[self.family]() else: - if self.family == 'normal': - self._family_instance = NormalDistribution() - elif self.family == 'poisson': - self._family_instance = PoissonDistribution() - elif self.family == 'gamma': - self._family_instance = GammaDistribution() - elif self.family == 'inverse.gaussian': - self._family_instance = InverseGaussianDistribution() - elif self.family == 'binomial': - self._family_instance = BinomialDistribution() - else: - raise ValueError( - "The family must be an instance of class" - " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " - "'binomial']; got (family={0})".format(self.family)) + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link @@ -1724,11 +1651,6 @@ def fit(self, X, y, sample_weight=None): self._link_instance = IdentityLink() if self._family_instance.power >= 1: self._link_instance = LogLink() - elif isinstance(self._family_instance, - GeneralizedHyperbolicSecant): - self._link_instance = IdentityLink() - elif isinstance(self._family_instance, BinomialDistribution): - self._link_instance = LogitLink() else: raise ValueError("No default link known for the " "specified distribution family. Please " @@ -2048,7 +1970,6 @@ def fit(self, X, y, sample_weight=None): # 4. fit # ####################################################################### # algorithms for optimization - # TODO: Parallelize it? 
# 4.1 IRLS ############################################################ # Note: we already set P2 = l2*P2, see above diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1416bdcfad680..f51f630ebae7e 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -18,7 +18,6 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecant, BinomialDistribution, ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.metrics import mean_absolute_error @@ -103,7 +102,7 @@ def test_tweedie_distribution_power(): (TweedieDistribution(power=1.5), [0.1, 1.5]), (TweedieDistribution(power=2.5), [0.1, 1.5]), (TweedieDistribution(power=-4), [0.1, 1.5]), - (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +]) def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: @@ -196,7 +195,7 @@ def test_sample_weights_validation(): ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse.gaussian', InverseGaussianDistribution()), - ('binomial', BinomialDistribution())]) +]) def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions @@ -424,7 +423,7 @@ def test_glm_identity_regression(solver): [NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecant()]) +]) @pytest.mark.parametrize('solver, tol', [('irls', 1e-6), ('lbfgs', 1e-6), ('newton-cg', 1e-7), @@ -620,33 +619,6 @@ def obj(coef): assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) -@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) -def test_binomial_enet(alpha): - """Test elastic net regression with binomial family and LogitLink. - - Compare to LogisticRegression. 
- """ - l1_ratio = 0.5 - n_samples = 500 - rng = np.random.RandomState(42) - X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, - n_informative=5, n_redundant=0, n_repeated=0, - random_state=rng) - log = LogisticRegression( - penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, - max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), - solver='saga') - log.fit(X, y) - - glm = GeneralizedLinearRegressor( - family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, - alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', - tol=1e-7) - glm.fit(X, y) - assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) - assert_allclose(log.coef_[0, :], glm.coef_, rtol=5e-6) - - @pytest.mark.parametrize( "params", [ From e67fecb9bc1ee056ad7934803818fd46a0a1f8b3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:00:24 -0500 Subject: [PATCH 068/209] Remove start params option --- sklearn/linear_model/_glm.py | 157 +++-------------------------------- 1 file changed, 13 insertions(+), 144 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 736e50960dcda..18eba80080670 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -1457,33 +1457,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` - as initialization for ``coef_`` and ``intercept_`` (supersedes option - ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to ``fit``), option ``start_params`` sets the - start values for ``coef_`` and ``intercept_``. - - start_params : {'guess', 'zero', array of shape (n_features*, )}, \ - optional (default='guess') - Relevant only if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - - 'guess' - Start values of mu are calculated by family.starting_mu(..). Then, - one Newton step obtains start values for ``coef_``. If - ``solver='irls'``, it uses one irls step, else the Newton step is - calculated by the cd solver. - This gives usually good starting values. - - 'zero' - All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the weighted average of y. - - array - The array of size n_features* is directly used as start values - for ``coef_``. If ``fit_intercept=True``, the first element - is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes - the intercept in counting. + as initialization for ``coef_`` and ``intercept_``. selection : str, optional (default='cyclic') For the solver 'cd' (coordinate descent), the coordinates (features) @@ -1503,7 +1477,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'random'. diag_fisher : boolean, optional, (default=False) - Only relevant for solver 'cd' (see also ``start_params='guess'``). + Only relevant for solver 'cd'. If ``False``, the full Fisher matrix (expected Hessian) is computed in each outer iteration (Newton iteration). 
If ``True``, only a diagonal matrix (stored as 1d array) is computed, such that @@ -1576,7 +1550,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params='guess', + tol=1e-4, warm_start=False, selection='cyclic', random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha @@ -1591,7 +1565,6 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.max_iter = max_iter self.tol = tol self.warm_start = warm_start - self.start_params = start_params self.selection = selection self.random_state = random_state self.diag_fisher = diag_fisher @@ -1742,7 +1715,7 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape # 1.3 arguments to take special care ################################## - # P1, P2, start_params + # P1, P2 if isinstance(self.P1, str) and self.P1 == 'identity': P1 = np.ones(n_features) else: @@ -1793,25 +1766,6 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - start_params = self.start_params - if isinstance(start_params, str): - if start_params not in ['guess', 'zero']: - raise ValueError("The argument start_params must be 'guess', " - "'zero' or an array of correct length; " - "got(start_params={0})".format(start_params)) - else: - start_params = check_array(start_params, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype, copy=True) - if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or - (start_params.ndim != 1)): - raise ValueError("Start values for parameters must have the" - "right length and dimension; required (length" - "={0}, ndim=1); got (length={1}, ndim={2})." 
- .format(X.shape[1] + self.fit_intercept, - start_params.shape[0], - start_params.ndim)) - l1 = self.alpha * self.l1_ratio l2 = self.alpha * (1 - self.l1_ratio) # P1 and P2 are now for sure copies @@ -1899,72 +1853,12 @@ def fit(self, X, y, sample_weight=None): self.coef_)) else: coef = self.coef_ - elif isinstance(start_params, str): - if start_params == 'guess': - # Set mu=starting_mu of the family and do one Newton step - # If solver=cd use cd, else irls - mu = family.starting_mu(y, weights=weights) - eta = link.link(mu) # linear predictor - if solver in ['cd', 'lbfgs', 'newton-cg']: - # see function _cd_solver - sigma_inv = 1/family.variance(mu, phi=1, weights=weights) - d1 = link.inverse_derivative(eta) - temp = sigma_inv * d1 * (y - mu) - if self.fit_intercept: - score = np.concatenate(([temp.sum()], temp @ X)) - else: - score = temp @ X # same as X.T @ temp - - d2_sigma_inv = d1 * d1 * sigma_inv - diag_fisher = self.diag_fisher - if diag_fisher: - fisher = d2_sigma_inv - else: - fisher = \ - _safe_sandwich_dot(X, d2_sigma_inv, - intercept=self.fit_intercept) - # set up space for search direction d for inner loop - if self.fit_intercept: - coef = np.zeros(n_features+1) - else: - coef = np.zeros(n_features) - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum of norm of subgradient of F - # use less restrictive tolerance for initial guess - inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, - P1=P1) - inner_tol = 4 * linalg.norm(inner_tol, ord=1) - # just one outer loop = Newton step - n_cycles = 0 - d, coef_P2, n_cycles, inner_tol = \ - _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, - inner_tol, max_inner_iter=1000, - selection=self.selection, - random_state=random_state, - diag_fisher=self.diag_fisher) - coef += d # for simplicity no line search here - else: - # See _irls_solver - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / family.variance(mu, phi=1, weights=weights)) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(X, W, P2, z, - fit_intercept=self.fit_intercept) - else: # start_params == 'zero' - if self.fit_intercept: - coef = np.zeros(n_features+1) - coef[0] = link.link(np.average(y, weights=weights)) - else: - coef = np.zeros(n_features) - else: # assign given array as start values - coef = start_params + else: + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) ####################################################################### # 4. fit # @@ -2312,32 +2206,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` - as initialization for ``coef_`` and ``intercept_`` (supersedes option - ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to ``fit``), option ``start_params`` sets the - start values for ``coef_`` and ``intercept_``. - - start_params : {'guess', 'zero', array of shape (n_features*, )}, \ - optional (default='guess') - Relevant only if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - - 'guess' - Start values of mu are calculated by family.starting_mu(..). Then, - one Newton step obtains start values for ``coef_``. 
If - ``solver='irls'``, it uses one irls step. This gives usually good - starting values. - - 'zero' - All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the weighted average of y. - - array - The array of size n_features* is directly used as start values - for ``coef_``. If ``fit_intercept=True``, the first element - is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes - the intercept in counting. + as initialization for ``coef_`` and ``intercept_`` . random_state : {int, RandomState instance, None}, optional (default=None) If int, random_state is the seed used by the random @@ -2401,12 +2270,12 @@ class PoissonRegressor(GeneralizedLinearRegressor): """ def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, solver='irls', max_iter=100, - tol=1e-4, warm_start=False, start_params='guess', + tol=1e-4, warm_start=False, random_state=None, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, - start_params=start_params, random_state=random_state, + random_state=random_state, copy_X=copy_X, verbose=verbose) From 62f4448101c16a797d74119fe1df5b45b93136b0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:31:42 -0500 Subject: [PATCH 069/209] Remove L1 penalty and CD solver --- sklearn/linear_model/_glm.py | 536 +------------------------ sklearn/linear_model/tests/test_glm.py | 185 +-------- 2 files changed, 36 insertions(+), 685 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 18eba80080670..15211c6038007 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -93,49 +93,6 @@ def _safe_sandwich_dot(X, d, intercept=False): return res -def _min_norm_sugrad(coef, grad, P2, P1): - """Compute the gradient of all subgradients with minimal L2-norm. - - subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) - - g_i = grad_i + (P2*coef)_i - - if coef_i > 0: g_i + P1_i - if coef_i < 0: g_i - P1_i - if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) - - Parameters - ---------- - coef : ndarray - coef[0] may be intercept. - - grad : ndarray, shape=coef.shape - - P2 : {1d or 2d array, None} - always without intercept, ``None`` means P2 = 0 - - P1 : ndarray - always without intercept - """ - intercept = (coef.size == P1.size + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - # compute grad + coef @ P2 without intercept - grad_wP2 = grad[idx:].copy() - if P2 is None: - pass - elif P2.ndim == 1: - grad_wP2 += coef[idx:] * P2 - else: - grad_wP2 += coef[idx:] @ P2 - res = np.where(coef[idx:] == 0, - np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), - grad_wP2 + np.sign(coef[idx:]) * P1) - if intercept: - return np.concatenate(([grad[0]], res)) - else: - return res - - class Link(metaclass=ABCMeta): """Abstract base class for Link functions.""" @@ -915,7 +872,7 @@ def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) # D2 = link.inverse_derivative(eta)^2 = D^2 # W = D2/V(mu) - # l2 = alpha * (1 - l1_ratio) + # l2 = alpha # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w # = -X' D (y-mu)/V(mu) + l2 P2 w # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) 
X + l2 P2 @@ -981,345 +938,6 @@ def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, return coef, n_iter -def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, - max_inner_iter=1000, selection='cyclic', - random_state=None, diag_fisher=False): - """Compute inner loop of coordinate descent, i.e. cycles through features. - - Minimization of 1-d subproblems:: - - min_z q(d+z*e_j) - q(d) - = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 - - A = f'(w) + d*H(w) + (w+d)*P2 - B = H+P2 - Note: f'=-score and H=fisher are updated at the end of outer iteration. - """ - # TODO: use sparsity (coefficient already 0 due to L1 penalty) - # => active set of features for featurelist, see paper - # of Improved GLMNET or Gap Safe Screening Rules - # https://arxiv.org/abs/1611.05780 - n_samples, n_features = X.shape - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - B = fisher - if P2.ndim == 1: - coef_P2 = coef[idx:] * P2 - if not diag_fisher: - idiag = np.arange(start=idx, stop=B.shape[0]) - # B[np.diag_indices_from(B)] += P2 - B[(idiag, idiag)] += P2 - else: - coef_P2 = coef[idx:] @ P2 - if not diag_fisher: - if sparse.issparse(P2): - B[idx:, idx:] += P2.toarray() - else: - B[idx:, idx:] += P2 - A = -score - A[idx:] += coef_P2 - # A += d @ (H+P2) but so far d=0 - # inner loop - for inner_iter in range(1, max_inner_iter+1): - inner_iter += 1 - n_cycles += 1 - # cycle through features, update intercept separately at the end - if selection == 'random': - featurelist = random_state.permutation(n_features) - else: - featurelist = np.arange(n_features) - for j in featurelist: - # minimize_z: a z + 1/2 b z^2 + c |d+z| - # a = A_j - # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, see 1.3 - # d = w_j + d_j - # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) - # with beta = z+d, beta_hat = d-a/b and gamma = c/b - # z = 1/b * S(bd-a,c) - d - # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - jdx = j+idx # index for arrays containing entries for intercept - a = A[jdx] - if diag_fisher: - # Note: fisher is ndarray of shape (n_samples,) => no idx - # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway - Bj = np.zeros_like(A) - if intercept: - Bj[0] = fisher.sum() - if sparse.issparse(X): - Bj[idx:] = _safe_toarray(X[:, j].transpose() @ - X.multiply(fisher[:, np.newaxis]) - ).ravel() - else: - Bj[idx:] = (fisher * X[:, j]) @ X - - if P2.ndim == 1: - Bj[idx:] += P2[j] - else: - if sparse.issparse(P2): - # slice columns as P2 is csc - Bj[idx:] += P2[:, j].toarray().ravel() - else: - Bj[idx:] += P2[:, j] - b = Bj[jdx] - else: - b = B[jdx, jdx] - - # those ten lines are what it is all about - if b <= 0: - z = 0 - elif P1[j] == 0: - z = -a/b - elif a + P1[j] < b * (coef[jdx] + d[jdx]): - z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[jdx] + d[jdx]): - z = -(a - P1[j])/b - else: - z = -(coef[jdx] + d[jdx]) - - # update direction d - d[jdx] += z - # update A because d_j is now d_j+z - # A = f'(w) + d*H(w) + (w+d)*P2 - # => A += (H+P2)*e_j z = B_j * z - # Note: B is symmetric B = B.transpose - if diag_fisher: - # Bj = B[:, j] calculated above, still valid - A += Bj * z - else: - # B is symmetric, C- or F-contiguous, but never sparse - if B.flags['F_CONTIGUOUS']: - # slice columns like for sparse csc - A += B[:, jdx] * z - else: # B.flags['C_CONTIGUOUS'] might be true - # slice rows - A += B[jdx, :] * z - # end of cycle over features - # update intercept - if intercept: - if diag_fisher: - Bj = np.zeros_like(A) - Bj[0] = fisher.sum() - Bj[1:] = fisher @ X - b = Bj[0] - else: - b = B[0, 0] - z = 0 if b <= 0 else -A[0]/b - d[0] += z - if diag_fisher: - A += Bj * z - else: - if B.flags['F_CONTIGUOUS']: - A += B[:, 0] * z - else: - A += B[0, :] * z - # end of complete cycle - # stopping criterion for inner loop - # sum_i(|minimum of norm of subgrad of q(d)_i|) - # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 - mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= inner_tol: - if inner_iter == 1: - inner_tol = inner_tol/4. - break - # end of inner loop - return d, coef_P2, n_cycles, inner_tol - - -def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, - max_iter=100, max_inner_iter=1000, tol=1e-4, - selection='cyclic ', random_state=None, - diag_fisher=False, copy_X=True): - """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. - - The objective being minimized in the coefficients w=coef is:: - - F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 - - An Improved GLMNET for L1-regularized Logistic Regression: - - 1. Find optimal descent direction d by minimizing - min_d F(w+d) = min_d F(w+d) - F(w) - 2. Quadratic approximation of F(w+d)-F(w) = q(d): - using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: - q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d - + ||P1*(w+d)||_1 - ||P1*w||_1 - Then minimize q(d): min_d q(d) - 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): - min_z q(d+z*e_j) - = min_z q(d+z*e_j) - q(d) - = min_z A_j z + 1/2 B_jj z^2 - + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 - A = f'(w) + d*H(w) + (w+d)*P2 - B = H + P2 - - Repeat steps 1-3 until convergence. - Note: Use Fisher matrix instead of Hessian for H. 
- Note: f' = -score, H = Fisher matrix - - Parameters - ---------- - coef : ndarray, shape (c,) - If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shape[1] + 1. - - X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) - Training data (with intercept included if present). If not sparse, - pass directly as Fortran-contiguous data to avoid - unnecessary memory duplication. - - y : ndarray, shape (n_samples,) - Target values. - - weights: ndarray, shape (n_samples,) - Sample weights with which the deviance is weighted. The weights must - bee normalized and sum to 1. - - P1 : {ndarray}, shape (n_features,) - The L1-penalty vector (=diagonal matrix) - - P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) - The L2-penalty matrix or vector (=diagonal matrix). If a matrix is - passed, it must be symmetric. If X is sparse, P2 must also be sparse. - - fit_intercept : boolean, optional (default=True) - Specifies if a constant (a.k.a. bias or intercept) should be - added to the linear predictor (X*coef+intercept). - - family : ExponentialDispersionModel - - link : Link - - max_iter : int, optional (default=100) - Maximum numer of outer (Newton) iterations. - - max_inner_iter : int, optional (default=1000) - Maximum number of iterations in each inner loop, i.e. max number of - cycles over all features per inner loop. - - tol : float, optional (default=1e-4) - Convergence criterion is - sum_i(|minimum of norm of subgrad of objective_i|)<=tol. - - selection : str, optional (default='cyclic') - If 'random', randomly chose features in inner loop. - - random_state : {int, RandomState instance, None}, optional (default=None) - - diag_fisher : boolean, optional (default=False) - ``False`` calculates full fisher matrix, ``True`` only diagonal matrix - s.t. fisher = X.T @ diag @ X. This saves storage but needs more - matrix-vector multiplications. - - copy_X : boolean, optional (default=True) - If ``True``, X will be copied; else, it may be overwritten. - - Returns - ------- - coef : ndarray, shape (c,) - If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shape[1] + 1. - - n_iter : number of outer iterations = newton iterations - - n_cycles : number of cycles over features - - References - ---------- - Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf - """ - X = check_array(X, 'csc', dtype=[np.float64, np.float32], - order='F', copy=copy_X) - if P2.ndim == 2: - P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], - order='F', copy=copy_X) - if sparse.issparse(X): - if not sparse.isspmatrix_csc(P2): - raise ValueError("If X is sparse, P2 must also be sparse csc" - "format. Got P2 not sparse.") - random_state = check_random_state(random_state) - # Note: we already set P2 = l2*P2, P1 = l1*P1 - # Note: we already symmetrized P2 = 1/2 (P2 + P2') - n_iter = 0 # number of outer iterations - n_cycles = 0 # number of (complete) cycles over features - converged = False - n_samples, n_features = X.shape - idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept - # line search parameters - (beta, sigma) = (0.5, 0.01) - # some precalculations - # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a - # 1d array representing a diagonal matrix. 
- eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=X, y=y, weights=weights, link=link, - diag_fisher=diag_fisher) - # set up space for search direction d for inner loop - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum of norm of subgradient of F - inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) - inner_tol = linalg.norm(inner_tol, ord=1) - # outer loop - while n_iter < max_iter: - n_iter += 1 - # initialize search direction d (to be optimized) with zero - d.fill(0) - # inner loop = _cd_cycle - d, coef_P2, n_cycles, inner_tol = \ - _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, - max_inner_iter=max_inner_iter, selection=selection, - random_state=random_state, diag_fisher=diag_fisher) - # line search by sequence beta^k, k=0, 1, .. - # F(w + lambda d) - F(w) <= lambda * bound - # bound = sigma * (f'(w)*d + w*P2*d - # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) - P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) - # Note: coef_P2 already calculated and still valid - bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) - Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) - la = 1./beta - for k in range(20): - la *= beta # starts with la=1 - coef_wd = coef + la * d - mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) - Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1 * coef_wd[idx:], ord=1)) - if P2.ndim == 1: - Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) - else: - Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) - if Fwd - Fw <= sigma * la * bound: - break - # update coefficients - coef += la * d - # calculate eta, mu, score, Fisher matrix for next iteration - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=X, y=y, weights=weights, link=link, - diag_fisher=diag_fisher) - # stopping criterion for outer loop - # sum_i(|minimum-norm of subgrad of F(w)_i|) - # fp_wP2 = f'(w) + w*P2 - # Note: eta, mu and score are already updated - mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= tol: - converged = True - break - # end of outer loop - if not converged: - warnings.warn("Coordinate descent failed to converge. Increase" - " the maximum number of iterations max_iter" - " (currently {0})".format(max_iter), ConvergenceWarning) - - return coef, n_iter, n_cycles - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. @@ -1329,28 +947,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) - + alpha * l1_ratio * ||P1*w||_1 - + 1/2 * alpha * (1 - l1_ratio) * w*P2*w - - with inverse link function h and s=sample_weight. Note that for - ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). - For ``P1=P2='identity'``, the penalty is the elastic net:: - - alpha * l1_ratio * ||w||_1 - + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - - If you are interested in controlling the L1 and L2 penalties - separately, keep in mind that this is equivalent to:: + + 1/2 * alpha * w*P2*w - a * L1 + b * L2 - - where:: - - alpha = a + b and l1_ratio = a / (a + b) - - The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, - while ``alpha`` corresponds to the lambda parameter in glmnet. 
- Specifically, l1_ratio = 1 is the lasso penalty. + with inverse link function h and s=sample_weight. + The parameter ``alpha`` corresponds to the lambda parameter in glmnet. Read more in the :ref:`User Guide `. @@ -1364,19 +964,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - l1_ratio : float, optional (default=0) - The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For - ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it - is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a - combination of L1 and L2. - - P1 : {'identity', array-like}, shape (n_features,), optional \ - (default='identity') - With this array, you can exclude coefficients from the L1 penalty. - Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``'identity'`` is the same as a 1d array of ones. - Note that n_features = X.shape[1]. - P2 : {'identity', array-like, sparse matrix}, shape \ (n_features,) or (n_features, n_features), optional \ (default='identity') @@ -1416,18 +1003,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ + solver : {'auto', 'irls', 'lbfgs', 'newton-cg'}, \ optional (default='auto') Algorithm to use in the optimization problem: 'auto' - Sets 'irls' if l1_ratio equals 0, else 'cd'. - - 'cd' - Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. Note that in order to avoid unnecessary memory - duplication of X in the ``fit`` method, X should be directly passed - as a Fortran-contiguous numpy array or sparse csc matrix. + Sets 'irls' 'irls' Iterated reweighted least squares. @@ -1450,31 +1031,17 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. For the cd solver, convergence is reached - when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the - subgradient of the objective and minimum-norm of ``g_i`` is the element - of the subgradient ``g_i`` with the smallest L2-norm. + the objective function. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_``. - selection : str, optional (default='cyclic') - For the solver 'cd' (coordinate descent), the coordinates (features) - can be updated in either cyclic or random order. - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially in the same order. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - random_state : {int, RandomState instance, None}, optional (default=None) - The seed of the pseudo random number generator that selects a random - feature to be updated for solver 'cd' (coordinate descent). If int, random_state is the seed used by the random number generator; if RandomState instance, random_state is the random number generator; if None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + RandomState instance used by `np.random`. 
diag_fisher : boolean, optional, (default=False) Only relevant for solver 'cd'. @@ -1547,15 +1114,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', + def __init__(self, alpha=1.0, P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, - selection='cyclic', random_state=None, diag_fisher=False, + random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha - self.l1_ratio = l1_ratio - self.P1 = P1 self.P2 = P2 self.fit_intercept = fit_intercept self.family = family @@ -1565,7 +1130,6 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.max_iter = max_iter self.tol = tol self.warm_start = warm_start - self.selection = selection self.random_state = random_state self.diag_fisher = diag_fisher self.copy_X = copy_X @@ -1645,28 +1209,16 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: raise ValueError("Penalty term must be a non-negative number;" " got (alpha={0})".format(self.alpha)) - if (not isinstance(self.l1_ratio, numbers.Number) or - self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be a number in interval [0, 1];" - " got (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " 'auto', 'irls', 'lbfgs', 'newton-cg';" " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': - if self.l1_ratio == 0: - solver = 'irls' - else: - solver = 'cd' - if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): - raise ValueError("The chosen solver (solver={0}) can't deal " - "with L1 penalties, which are included with " - "(alpha={1}) and (l1_ratio={2})." 
- .format(solver, self.alpha, self.l1_ratio)) + solver = 'irls' if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1678,10 +1230,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - if self.selection not in ['cyclic', 'random']: - raise ValueError("The argument selection must be 'cyclic' or " - "'random'; got (selection={0})" - .format(self.selection)) random_state = check_random_state(self.random_state) if not isinstance(self.diag_fisher, bool): raise ValueError("The argument diag_fisher must be bool;" @@ -1698,16 +1246,10 @@ def fit(self, X, y, sample_weight=None): # 1.2 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] - if solver == 'cd': - _stype = ['csc'] - else: - _stype = ['csc', 'csr'] + _stype = ['csc', 'csr'] X, y = check_X_y(X, y, accept_sparse=_stype, dtype=_dtype, y_numeric=True, multi_output=False, copy=self.copy_X) - # Without converting y to float, deviance might raise - # ValueError: Integers to negative integer powers are not allowed. - # Also, y must not be sparse. y = np.asarray(y, dtype=np.float64) weights = _check_weights(sample_weight, y.shape[0]) @@ -1715,23 +1257,8 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape # 1.3 arguments to take special care ################################## - # P1, P2 - if isinstance(self.P1, str) and self.P1 == 'identity': - P1 = np.ones(n_features) - else: - P1 = np.atleast_1d(self.P1) - try: - P1 = P1.astype(np.float64, casting='safe', copy=False) - except TypeError: - raise TypeError("The given P1 cannot be converted to a numeric" - "array; got (P1.dtype={0})." - .format(P1.dtype)) - if (P1.ndim != 1) or (P1.shape[0] != n_features): - raise ValueError("P1 must be either 'identity' or a 1d array " - "with the length of X.shape[1]; " - "got (P1.shape[0]={0}), " - "needed (X.shape[1]={1})." - .format(P1.shape[0], n_features)) + # P2 + # If X is sparse, make P2 sparse, too. if isinstance(self.P2, str) and self.P2 == 'identity': if sparse.issparse(X): @@ -1766,10 +1293,8 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - l1 = self.alpha * self.l1_ratio - l2 = self.alpha * (1 - self.l1_ratio) - # P1 and P2 are now for sure copies - P1 = l1 * P1 + l2 = self.alpha + # P2 is now for sure a copy P2 = l2 * P2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric @@ -1792,11 +1317,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - # check if P1 has only non-negative values, negative values might - # indicate group lasso in the future. - if not isinstance(self.P1, str): # if self.P1 != 'identity': - if not np.all(P1 >= 0): - raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite if not isinstance(self.P2, str): # self.P2 != 'identity' @@ -1845,8 +1365,6 @@ def fit(self, X, y, sample_weight=None): # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. 
- # set start values for coef - coef = None if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), @@ -1975,18 +1493,6 @@ def Hs(coef): args=args, maxiter=self.max_iter, tol=self.tol) - # 4.4 coordinate descent ############################################## - # Note: we already set P1 = l1*P1, see above - # Note: we already set P2 = l2*P2, see above - # Note: we already symmetrized P2 = 1/2 (P2 + P2') - elif solver == 'cd': - coef, self.n_iter_, self._n_cycles = \ - _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, - P2=P2, fit_intercept=self.fit_intercept, - family=family, link=link, - max_iter=self.max_iter, tol=self.tol, - selection=self.selection, random_state=random_state, - diag_fisher=self.diag_fisher, copy_X=self.copy_X) ####################################################################### # 5. postprocessing # @@ -2097,9 +1603,6 @@ def estimate_phi(self, X, y, sample_weight=None): dev = self._family_instance.deviance(y, mu, weights) return dev/(n_samples - n_features) - # Note: check_estimator(GeneralizedLinearRegressor) might raise - # "AssertionError: -0.28014056555724598 not greater than 0.5" - # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): """Compute D^2, the percentage of deviance explained. @@ -2212,8 +1715,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): If int, random_state is the seed used by the random number generator; if RandomState instance, random_state is the random number generator; if None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + RandomState instance used by `np.random`. copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
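For reference, the objective that this patch leaves in place is 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * w*P2*w, as stated in the updated GeneralizedLinearRegressor docstring above. The following minimal sketch is not part of the patch; it assumes the normal family with identity link (so the deviance is the weighted squared error) and a dense P2, and simply evaluates that objective for a given coefficient vector:

    import numpy as np

    def l2_penalized_objective(coef, X, y, sample_weight, alpha, P2):
        # deviance of the normal family with identity link:
        # weighted sum of squared residuals
        mu = X @ coef
        dev = np.sum(sample_weight * (y - mu) ** 2)
        # Tikhonov (L2) penalty 1/2 * alpha * w' P2 w
        penalty = 0.5 * alpha * coef @ (P2 @ coef)
        return dev / (2 * np.sum(sample_weight)) + penalty

    rng = np.random.RandomState(0)
    X = rng.randn(20, 3)
    y = X @ np.array([1.0, -2.0, 0.5]) + rng.randn(20)
    print(l2_penalized_objective(np.zeros(3), X, y,
                                 np.ones(20), alpha=1.0, P2=np.eye(3)))

With P2 equal to the identity matrix this reduces to the usual squared L2 (ridge) penalty, which is why 'identity' is the default.
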
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index f51f630ebae7e..8fc1241e1da7a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -25,7 +25,7 @@ from sklearn.utils.testing import assert_array_equal -GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg', 'cd'] +GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg'] @pytest.fixture(scope="module") @@ -236,28 +236,6 @@ def test_glm_alpha_argument(alpha): glm.fit(X, y) -@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) -def test_glm_l1_ratio_argument(l1_ratio): - """Test GLM for invalid l1_ratio argument.""" - y = np.array([1, 2]) - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) - with pytest.raises(ValueError, - match="l1_ratio must be a number in interval.*0, 1"): - glm.fit(X, y) - - -@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], - [-1]]) -def test_glm_P1_argument(P1): - """Test GLM for invalid P1 argument.""" - y = np.array([1, 2]) - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) - with pytest.raises((ValueError, TypeError)): - glm.fit(X, y) - - @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], sparse.csr_matrix([1, 2, 3]), [-1]]) def test_glm_P2_argument(P2): @@ -301,14 +279,13 @@ def test_glm_fit_intercept_argument(fit_intercept): glm.fit(X, y) -@pytest.mark.parametrize('solver, l1_ratio', - [('not a solver', 0), (1, 0), ([1], 0), - ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) -def test_glm_solver_argument(solver, l1_ratio): +@pytest.mark.parametrize('solver', + ['not a solver', 1, [1]]) +def test_glm_solver_argument(solver): """Test GLM for invalid solver argument.""" y = np.array([1, 2]) X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + glm = GeneralizedLinearRegressor(solver=solver) with pytest.raises(ValueError): glm.fit(X, y) @@ -343,28 +320,6 @@ def test_glm_warm_start_argument(warm_start): glm.fit(X, y) -@pytest.mark.parametrize('start_params', - ['not a start_params', ['zero'], [0, 0, 0], - [[0, 0]], ['a', 'b']]) -def test_glm_start_params_argument(start_params): - """Test GLM for invalid start_params argument.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(start_params=start_params) - with pytest.raises(ValueError): - glm.fit(X, y) - - -@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) -def test_glm_selection_argument(selection): - """Test GLM for invalid selection argument""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(selection=selection) - with pytest.raises(ValueError, match="argument selection must be"): - glm.fit(X, y) - - @pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) def test_glm_random_state_argument(random_state): """Test GLM for invalid random_state argument.""" @@ -413,7 +368,7 @@ def test_glm_identity_regression(solver): y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False, solver=solver, - start_params='zero', tol=1e-7) + tol=1e-7) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=1e-6) @@ -427,7 +382,7 @@ def test_glm_identity_regression(solver): @pytest.mark.parametrize('solver, tol', [('irls', 1e-6), ('lbfgs', 1e-6), ('newton-cg', 1e-7), - ('cd', 1e-7)]) +]) def test_glm_log_regression(family, solver, tol): """Test GLM 
regression with log link on a simple dataset.""" coef = [0.2, -0.1] @@ -435,7 +390,7 @@ def test_glm_log_regression(family, solver, tol): y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, start_params='guess', tol=tol) + solver=solver, tol=tol) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=5e-6) @@ -472,14 +427,14 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): random_state=42, **ridge_params) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, max_iter=300, solver=solver, tol=1e-6, check_input=False, random_state=42) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) @@ -487,7 +442,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): [('irls', 1e-7), ('lbfgs', 1e-7), ('newton-cg', 1e-7), - ('cd', 1e-7)]) +]) def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. @@ -506,130 +461,24 @@ def test_poisson_ridge(solver, tol): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) rng = np.random.RandomState(42) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1, fit_intercept=True, family='poisson', link='log', tol=1e-7, solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) - assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-6) - - -@pytest.mark.parametrize('diag_fisher', [False, True]) -def test_normal_enet(diag_fisher): - """Test elastic net regression with normal/gaussian family.""" - alpha, l1_ratio = 0.3, 0.7 - n_samples, n_features = 20, 2 - rng = np.random.RandomState(42) - X = rng.randn(n_samples, n_features).copy(order='F') - beta = rng.randn(n_features) - y = 2 + np.dot(X, beta) + rng.randn(n_samples) - - # 1. test normal enet on dense data - glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, - family='normal', link='identity', - fit_intercept=True, tol=1e-8, - max_iter=100, selection='cyclic', - solver='cd', start_params='zero', - check_input=False, - diag_fisher=diag_fisher) - glm.fit(X, y) - - enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, - normalize=False, tol=1e-8, copy_X=True) - enet.fit(X, y) - - assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) - assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) - - # 2. test normal enet on sparse data - X = sparse.csc_matrix(X) - glm.fit(X, y) - assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) - assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) - - -def test_poisson_enet(): - """Test elastic net regression with poisson family and LogLink. - - Compare to R's glmnet""" - # library("glmnet") - # options(digits=10) - # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) - # x <- data.matrix(df[,c("a", "b")]) - # y <- df$y - # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", - # standardize=F, thresh=1e-10, nlambda=10000) - # coef(fit, s=1) - # (Intercept) -0.03550978409 - # a 0.16936423283 - # b . 
- glmnet_intercept = -0.03550978409 - glmnet_coef = [0.16936423283, 0.] - X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T - y = np.array([0, 1, 1, 2]) - rng = np.random.RandomState(42) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-8, - selection='random', random_state=rng, - start_params='guess') - glm.fit(X, y) - assert_allclose(glm.intercept_, glmnet_intercept, rtol=2e-6) - assert_allclose(glm.coef_, glmnet_coef, rtol=2e-7) - - # test results with general optimization procedure - def obj(coef): - pd = PoissonDistribution() - link = LogLink() - N = y.shape[0] - mu = link.inverse(X @ coef[1:] + coef[0]) - alpha, l1_ratio = (1, 0.5) - return 1./(2.*N) * pd.deviance(y, mu) \ - + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ - + alpha * l1_ratio * np.sum(np.abs(coef[1:])) - res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, - options={'maxiter': 1000, 'disp': False}) - assert_allclose(glm.intercept_, res.x[0], rtol=5e-5) - assert_allclose(glm.coef_, res.x[1:], rtol=1e-5, atol=1e-9) - assert_allclose(obj(np.concatenate(([glm.intercept_], glm.coef_))), - res.fun, rtol=1e-8) - - # same for start_params='zero' and selection='cyclic' - # with reduced precision - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-5, - selection='cyclic', start_params='zero') - glm.fit(X, y) - assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) - assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) - - # check warm_start, therefore start with different alpha - glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, - family='poisson', max_iter=300, - link='log', solver='cd', tol=1e-5, - selection='cyclic', start_params='zero') - glm.fit(X, y) - # warm start with original alpha and use of sparse matrices - glm.warm_start = True - glm.alpha = 1 - X = sparse.csr_matrix(X) - glm.fit(X, y) - assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) - assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) + assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) @pytest.mark.parametrize( "params", [ - {"solver": "irls", "start_params": "guess"}, - {"solver": "irls", "start_params": "zero"}, - {"solver": "lbfgs", "start_params": "guess"}, - {"solver": "lbfgs", "start_params": "zero"}, + {"solver": "irls" }, + {"solver": "irls" }, + {"solver": "lbfgs" }, + {"solver": "lbfgs"}, {"solver": "newton-cg"}, - {"solver": "cd", "selection": "cyclic", "diag_fisher": False}, - {"solver": "cd", "selection": "cyclic", "diag_fisher": True}, - {"solver": "cd", "selection": "random", "diag_fisher": False}, ], ids=lambda params: ', '.join("%s=%s" % (key, val) for key, val in params.items()) From d25042e23c595b34cd5382a60bfd39613ac3a2ae Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:50:27 -0500 Subject: [PATCH 070/209] Remove newton CG algorithm --- sklearn/linear_model/_glm.py | 100 +++---------------------- sklearn/linear_model/tests/test_glm.py | 8 +- 2 files changed, 13 insertions(+), 95 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 15211c6038007..e11d7c08064ce 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -17,7 +17,6 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted, check_random_state @@ 
-1003,7 +1002,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'lbfgs', 'newton-cg'}, \ + solver : {'auto', 'irls', 'lbfgs'}, \ optional (default='auto') Algorithm to use in the optimization problem: @@ -1016,10 +1015,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): L1 penalties. 'lbfgs' - Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + Calls scipy's L-BFGS-B optimizer. - 'newton-cg', 'lbfgs' - Newton conjugate gradient algorithm cannot deal with L1 penalties. Note that all solvers except lbfgs use the fisher matrix, i.e. the expected Hessian instead of the Hessian matrix. @@ -1028,7 +1025,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, newton-cg and lbfgs solvers, + Stopping criterion. For the irls, and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. @@ -1212,9 +1209,9 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['auto', 'irls', 'lbfgs']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - " 'auto', 'irls', 'lbfgs', 'newton-cg';" + "'auto', 'irls', 'lbfgs';" " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': @@ -1410,6 +1407,9 @@ def func(coef, X, y, weights, P2, family, link): return obj, objp args = (X, y, weights, P2, family, link) + # TODO: refactor this once + # https://github.com/scikit-learn/scikit-learn/pull/14250 + # is merged. coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1423,76 +1423,6 @@ def func(coef, X, y, weights, P2, family, link): .format(info["task"])) self.n_iter_ = info['nit'] - # 4.3 Newton-CG ####################################################### - # We use again the fisher matrix instead of the hessian. More - # precisely, expected hessian of deviance. 
- elif solver == 'newton-cg': - def func(coef, X, y, weights, P2, family, link): - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = coef[idx:] @ (P2 * coef[idx:]) - else: - L2 = coef[idx:] @ (P2 @ coef[idx:]) - mu = link.inverse(_safe_lin_pred(X, coef)) - return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 - - def grad(coef, X, y, weights, P2, family, link): - mu, devp = \ - family._mu_deviance_derivative(coef, X, y, weights, link) - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = P2 * coef[idx:] - else: - L2 = P2 @ coef[idx:] - objp = 0.5 * devp - objp[idx:] += L2 - return objp - - def grad_hess(coef, X, y, weights, P2, family, link): - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = P2 * coef[idx:] - else: - L2 = P2 @ coef[idx:] - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - d1 = link.inverse_derivative(eta) - temp = d1 * family.deviance_derivative(y, mu, weights) - if intercept: - grad = np.concatenate(([0.5 * temp.sum()], - 0.5 * temp @ X + L2)) - else: - grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 - - # expected hessian = fisher = X.T @ diag_matrix @ X - # calculate only diag_matrix - diag = d1**2 / family.variance(mu, phi=1, weights=weights) - if intercept: - h0i = np.concatenate(([diag.sum()], diag @ X)) - - def Hs(coef): - # return (0.5 * fisher + P2) @ coef - # ret = 0.5 * (X.T @ (diag * (X @ coef))) - ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) - if P2.ndim == 1: - ret += P2 * coef[idx:] - else: - ret += P2 @ coef[idx:] - if intercept: - ret = np.concatenate(([0.5 * (h0i @ coef)], - ret + 0.5 * coef[0] * h0i[1:])) - return ret - - return grad, Hs - - args = (X, y, weights, P2, family, link) - coef, self.n_iter_ = newton_cg(grad_hess, func, grad, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) - ####################################################################### # 5. postprocessing # @@ -1511,7 +1441,7 @@ def Hs(coef): return self - def linear_predictor(self, X): + def _linear_predictor(self, X): """Compute the linear_predictor = X*coef_ + intercept_. Parameters @@ -1552,7 +1482,7 @@ def predict(self, X, sample_weight=None): X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) - eta = self.linear_predictor(X) + eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) weights = _check_weights(sample_weight, X.shape[0]) @@ -1682,7 +1612,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'lbfgs', 'newton-cg'}, optional (default='irls') + solver : {'irls', 'lbfgs'}, optional (default='irls') Algorithm to use in the optimization problem: 'irls' @@ -1692,17 +1622,11 @@ class PoissonRegressor(GeneralizedLinearRegressor): 'lbfgs' Calls scipy's L-BFGS-B optimizer. - 'newton-cg' - Newton conjugate gradient algorithm. - - Note that all solvers except lbfgs use the fisher matrix, i.e. the - expected Hessian instead of the Hessian matrix. - max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, newton-cg and lbfgs solvers, + Stopping criterion. 
For the irls, and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 8fc1241e1da7a..0cee56afb3042 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -25,7 +25,7 @@ from sklearn.utils.testing import assert_array_equal -GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg'] +GLM_SOLVERS = ['irls', 'lbfgs'] @pytest.fixture(scope="module") @@ -381,7 +381,6 @@ def test_glm_identity_regression(solver): ]) @pytest.mark.parametrize('solver, tol', [('irls', 1e-6), ('lbfgs', 1e-6), - ('newton-cg', 1e-7), ]) def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" @@ -395,9 +394,6 @@ def test_glm_log_regression(family, solver, tol): assert_allclose(res.coef_, coef, rtol=5e-6) -# newton-cg may issue a LineSearchWarning, which we filter out -@pytest.mark.filterwarnings('ignore:The line search algorithm') -@pytest.mark.filterwarnings('ignore:Line Search failed') @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('solver', GLM_SOLVERS) @@ -441,7 +437,6 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): @pytest.mark.parametrize('solver, tol', [('irls', 1e-7), ('lbfgs', 1e-7), - ('newton-cg', 1e-7), ]) def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. @@ -478,7 +473,6 @@ def test_poisson_ridge(solver, tol): {"solver": "irls" }, {"solver": "lbfgs" }, {"solver": "lbfgs"}, - {"solver": "newton-cg"}, ], ids=lambda params: ', '.join("%s=%s" % (key, val) for key, val in params.items()) From 07ee4954ef118227832d9ac2ad562a5aec7af38b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:54:20 -0500 Subject: [PATCH 071/209] Remove fisher_matrix, _observed_information and _eta_mu_score_fisher --- sklearn/linear_model/_glm.py | 120 ------------------------- sklearn/linear_model/tests/test_glm.py | 43 --------- 2 files changed, 163 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index e11d7c08064ce..fa434821bb80e 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -253,9 +253,6 @@ class ExponentialDispersionModel(metaclass=ABCMeta): _mu_deviance_derivative _score - _fisher_matrix - _observed_information - _eta_mu_score_fisher References ---------- @@ -518,123 +515,6 @@ def _score(self, coef, phi, X, y, weights, link): score = temp @ X # sampe as X.T @ temp return score - def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""Compute the Fisher information matrix. - - The Fisher information matrix, also known as expected information - matrix is given by - - .. math: - - \mathbf{F}(\boldsymbol{w}) = - \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial - \boldsymbol{w}} \right] - = \mathrm{E}\left[ - -\frac{\partial^2 loglike}{\partial\boldsymbol{w} - \partial\boldsymbol{w}^T}\right] - = \mathbf{X}^T W \mathbf{X} \,, - - with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see func:`_score`. 
- """ - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d = link.inverse_derivative(lin_pred) - d2_sigma_inv = sigma_inv * d * d - intercept = (coef.size == X.shape[1] + 1) - fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, - intercept=intercept) - return fisher_matrix - - def _observed_information(self, coef, phi, X, y, weights, link): - r"""Compute the observed information matrix. - - The observed information matrix, also known as the negative of - the Hessian matrix of the log-likelihood, is given by - - .. math: - - \mathbf{H}(\boldsymbol{w}) = - -\frac{\partial^2 loglike}{\partial\boldsymbol{w} - \partial\boldsymbol{w}^T} - = \mathbf{X}^T \left[ - - \mathbf{D}' \mathbf{R} - + \mathbf{D}^2 \mathbf{V} \mathbf{R} - + \mathbf{D}^2 - \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, - - with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, - :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ - v(\mu_i)} - \right)`, - see :func:`score_` function and :func:`_fisher_matrix`. - """ - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - dp = link.inverse_derivative2(lin_pred) - d2 = link.inverse_derivative(lin_pred)**2 - v = self.unit_variance_derivative(mu)/self.unit_variance(mu) - r = y - mu - temp = sigma_inv * (-dp * r + d2 * v * r + d2) - intercept = (coef.size == X.shape[1] + 1) - observed_information = _safe_sandwich_dot(X, temp, - intercept=intercept) - return observed_information - - def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, - diag_fisher=False): - """Compute linear predictor, mean, score function and fisher matrix. - - It calculates the linear predictor, the mean, score function - (derivative of log-likelihood) and Fisher information matrix - all in one go as function of `coef` (:math:`w`) and the data. - - Parameters - ---------- - diag_fisher : boolean, optional (default=False) - If ``True``, returns only an array d such that - fisher = X.T @ np.diag(d) @ X. - - Returns - ------- - (eta, mu, score, fisher) : tuple with 4 elements - The 4 elements are: - - * eta: ndarray, shape (X.shape[0],) - * mu: ndarray, shape (X.shape[0],) - * score: ndarray, shape (X.shape[0],) - * fisher: - - * If diag_fisher is ``False``, the full fisher matrix, - an array of shape (X.shape[1], X.shape[1]) - * If diag_fisher is ``True`, an array of shape (X.shape[0]) - """ - intercept = (coef.size == X.shape[1] + 1) - # eta = linear predictor - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) - d1 = link.inverse_derivative(eta) # = h'(eta) - # Alternatively: - # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g - # d1 = 1./link.derivative(mu) - d1_sigma_inv = d1 * sigma_inv - temp = d1_sigma_inv * (y - mu) - if intercept: - score = np.concatenate(([temp.sum()], temp @ X)) - else: - score = temp @ X - - d2_sigma_inv = d1 * d1_sigma_inv - if diag_fisher: - fisher_matrix = d2_sigma_inv - else: - fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, - intercept=intercept) - return eta, mu, score, fisher_matrix - class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. 
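All three removed helpers were built around the expected information matrix F = X' W X with W = diag(h'(eta_i)^2 / Var[y_i]), as described in their docstrings. The sketch below is not part of the patch; it hard-codes the Poisson family with log link, where h'(eta) = mu and Var[y_i] = phi*mu_i/s_i, so the diagonal weight reduces to s_i*mu_i/phi, and it rebuilds the same matrix directly:

    import numpy as np

    def fisher_poisson_log(coef, X, sample_weight, phi=1.0):
        # eta = linear predictor, mu = inverse link h(eta) = exp(eta)
        eta = X @ coef
        mu = np.exp(eta)
        # W_ii = h'(eta_i)^2 / Var[y_i] = mu_i^2 / (phi * mu_i / s_i)
        w = sample_weight * mu / phi
        return (X * w[:, np.newaxis]).T @ X     # X' diag(w) X

    rng = np.random.RandomState(42)
    X = rng.randn(10, 5)
    coef = np.array([-2.0, 1.0, 0.0, 1.0, 2.5])
    F = fisher_poisson_log(coef, X, np.ones(10))
    # the expected information is symmetric positive semi-definite
    assert np.allclose(F, F.T)
    assert np.all(np.linalg.eigvalsh(F) >= -1e-10)

This is the matrix that the removed test_fisher_matrix test (deleted from test_glm.py just below) checked numerically against finite differences of the score.
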
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 0cee56afb3042..b9716388fb36c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -109,49 +109,6 @@ def test_deviance_zero(family, chk_values): assert_allclose(family.deviance(x, x), 0, atol=1e-9) -@pytest.mark.parametrize( - 'family, link', - [(NormalDistribution(), IdentityLink()), - (PoissonDistribution(), LogLink()), - (GammaDistribution(), LogLink()), - (InverseGaussianDistribution(), LogLink()), - (TweedieDistribution(power=1.5), LogLink()), - (TweedieDistribution(power=4.5), LogLink())], - ids=lambda args: args.__class__.__name__) -def test_fisher_matrix(family, link): - """Test the Fisher matrix numerically. - Trick: Use numerical differentiation with y = mu""" - coef = np.array([-2, 1, 0, 1, 2.5]) - phi = 0.5 - rng = np.random.RandomState(42) - X = rng.randn(10, 5) - lin_pred = np.dot(X, coef) - mu = link.inverse(lin_pred) - weights = rng.randn(10)**2 + 1 - fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - # check that the Fisher matrix is square and positive definite - assert fisher.ndim == 2 - assert fisher.shape[0] == fisher.shape[1] - assert np.all(np.linalg.eigvals(fisher) >= 0) - - approx = np.array([]).reshape(0, coef.shape[0]) - for i in range(coef.shape[0]): - def f(coef): - return -family._score(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link)[i] - approx = np.vstack( - [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) - assert_allclose(fisher, approx, rtol=1e-3) - - # check the observed information matrix - oim = family._observed_information(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - assert oim.ndim == 2 - assert oim.shape == fisher.shape - assert_allclose(oim, fisher) - - def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" # scalar value but not positive From d0eb2850b91e74d1e8591f1bbd758cb06d85a3bc Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:21:18 -0500 Subject: [PATCH 072/209] Remove matrix L2 penalty and IRLS solver --- sklearn/linear_model/_glm.py | 327 ++----------------------- sklearn/linear_model/tests/test_glm.py | 57 +---- 2 files changed, 26 insertions(+), 358 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index fa434821bb80e..b469b3c4edd17 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -669,164 +669,16 @@ def __init__(self): } -def _irls_step(X, W, P2, z, fit_intercept=True): - """Compute one step in iteratively reweighted least squares. - - Solve A w = b for w with - A = (X' W X + P2) - b = X' W z - z = eta + D^-1 (y-mu) - - See also fit method of :class:`GeneralizedLinearRegressor`. - - Parameters - ---------- - X : {ndarray, sparse matrix}, shape (n_samples, n_features) - Training data (with intercept included if present) - - W : ndarray, shape (n_samples,) - - P2 : {ndarray, sparse matrix}, shape (n_features, n_features) - The L2-penalty matrix or vector (=diagonal matrix) - - z : ndarray, shape (n_samples,) - Working observations - - fit_intercept : boolean, optional (default=True) - - Returns - ------- - coef : ndarray, shape (c,) - If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. - """ - # Note: solve vs least squares, what is more appropriate? - # scipy.linalg.solve seems faster, but scipy.linalg.lstsq - # is more robust. 
- # Note: X.T @ W @ X is not sparse, even when X is sparse. - # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) - if fit_intercept: - Wz = W * z - if sparse.issparse(X): - b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) - else: - b = np.concatenate(([Wz.sum()], X.T @ Wz)) - A = _safe_sandwich_dot(X, W, intercept=fit_intercept) - if P2.ndim == 1: - idx = np.arange(start=1, stop=A.shape[0]) - A[(idx, idx)] += P2 # add to diag elements without intercept - elif sparse.issparse(P2): - A[1:, 1:] += P2.toarray() - else: - A[1:, 1:] += P2 - else: - if sparse.issparse(X): - XtW = X.transpose().multiply(W) - # for older versions of numpy and scipy, A may be a np.matrix - A = _safe_toarray(XtW @ X) - else: - XtW = (X.T * W) - A = XtW @ X - b = XtW @ z - if P2.ndim == 1: - A[np.diag_indices_from(A)] += P2 - elif sparse.issparse(P2): - A += P2.toarray() - else: - A += P2 - - coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) - return coef - - -def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, - max_iter, tol): - """Solve GLM with L2 penalty by IRLS algorithm. - - Note: If X is sparse, P2 must also be sparse. - """ - # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' - # Obj = objective function = 1/2 Dev + l2/2 w P2 w - # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 - # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) - # D2 = link.inverse_derivative(eta)^2 = D^2 - # W = D2/V(mu) - # l2 = alpha - # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w - # = -X' D (y-mu)/V(mu) + l2 P2 w - # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 - # Use Fisher matrix instead of full info matrix -X'(...) X, - # i.e. E[Dev''] with E[y-mu]=0: - # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, - # with z = eta + D^-1 (y-mu) - # Note: P2 must be symmetrized - # Note: ' denotes derivative, but also transpose for matrices - - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - # D = h'(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - converged = False - n_iter = 0 - while n_iter < max_iter: - n_iter += 1 - # coef_old not used so far. - # coef_old = coef - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = hp**2 / V - # working observations - z = eta + (y - mu) / hp - # solve A*coef = b - # A = X' W X + P2, b = X' W z - coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) - # updated linear predictor - # do it here for updated values for tolerance - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - # which tolerace? |coef - coef_old| or gradient? - # use gradient for compliance with newton-cg and lbfgs - # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - temp = hp * (y - mu) / V - if sparse.issparse(X): - gradient = -(X.transpose() @ temp) - else: - gradient = -(X.T @ temp) - idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - gradient += P2 * coef[idx:] - else: - gradient += P2 @ coef[idx:] - if fit_intercept: - gradient = np.concatenate(([-temp.sum()], gradient)) - if (np.max(np.abs(gradient)) <= tol): - converged = True - break - - if not converged: - warnings.warn("irls failed to converge. 
Increase the number " - "of iterations (currently {0})" - .format(max_iter), ConvergenceWarning) - - return coef, n_iter - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and predicting the mean of the target y as mu=h(X*w). Therefore, - the fit minimizes the following objective function with combined L1 and L2 + the fit minimizes the following objective function with L2 priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) - + 1/2 * alpha * w*P2*w + + 1/2 * alpha * |w|_2 with inverse link function h and s=sample_weight. The parameter ``alpha`` corresponds to the lambda parameter in glmnet. @@ -843,18 +695,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - P2 : {'identity', array-like, sparse matrix}, shape \ - (n_features,) or (n_features, n_features), optional \ - (default='identity') - With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. - This gives a fine control over this penalty (Tikhonov regularization). - A 2d array is directly used as the square matrix P2. A 1d array is - interpreted as diagonal (square) matrix. The default 'identity' sets - the identity matrix, which gives the usual squared L2-norm. If you just - want to exclude certain coefficients, pass a 1d array filled with 1, - and 0 for the coefficients to be excluded. - Note that P2 must be positive semi-definite. - fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -882,17 +722,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'lbfgs'}, \ - optional (default='auto') + solver : {'auto', 'lbfgs'}, optional (default='auto') Algorithm to use in the optimization problem: 'auto' - Sets 'irls' - - 'irls' - Iterated reweighted least squares. - It is the standard algorithm for GLMs. It cannot deal with - L1 penalties. + Sets 'lbfgs' 'lbfgs' Calls scipy's L-BFGS-B optimizer. @@ -905,7 +739,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, and lbfgs solvers, + Stopping criterion. For the lbfgs solver, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. @@ -920,22 +754,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): number generator; if None, the random number generator is the RandomState instance used by `np.random`. - diag_fisher : boolean, optional, (default=False) - Only relevant for solver 'cd'. - If ``False``, the full Fisher matrix (expected Hessian) is computed in - each outer iteration (Newton iteration). If ``True``, only a diagonal - matrix (stored as 1d array) is computed, such that - fisher = X.T @ diag @ X. This saves memory and matrix-matrix - multiplications, but needs more matrix-vector multiplications. If you - use large sparse X or if you have many features, - i.e. n_features >> n_samples, you might set this option to ``True``. 
- copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. check_input : boolean, optional (default=True) Allow to bypass several checks on input: y values in range of family, - sample_weight non-negative, P2 positive semi-definite. + sample_weight non-negative. Don't use this parameter unless you know what you do. verbose : int, optional (default=0) @@ -991,14 +815,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, P2='identity', + def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, - random_state=None, diag_fisher=False, + random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha - self.P2 = P2 self.fit_intercept = fit_intercept self.family = family self.link = link @@ -1008,7 +831,6 @@ def __init__(self, alpha=1.0, P2='identity', self.tol = tol self.warm_start = warm_start self.random_state = random_state - self.diag_fisher = diag_fisher self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -1051,8 +873,8 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " - "'binomial']; got (family={0})".format(self.family)) + " ['normal', 'poisson', 'gamma', 'inverse.gaussian']" + "; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link @@ -1089,13 +911,13 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'irls', 'lbfgs']: + if self.solver not in ['auto', 'lbfgs']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - "'auto', 'irls', 'lbfgs';" + "'auto', 'lbfgs';" " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': - solver = 'irls' + solver = 'lbfgs' if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1108,9 +930,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) random_state = check_random_state(self.random_state) - if not isinstance(self.diag_fisher, bool): - raise ValueError("The argument diag_fisher must be bool;" - " got {0}".format(self.diag_fisher)) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -1133,95 +952,12 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape - # 1.3 arguments to take special care ################################## - # P2 - - # If X is sparse, make P2 sparse, too. 
- if isinstance(self.P2, str) and self.P2 == 'identity': - if sparse.issparse(X): - P2 = (sparse.dia_matrix((np.ones(n_features), 0), - shape=(n_features, n_features))).tocsc() - else: - P2 = np.ones(n_features) - else: - P2 = check_array(self.P2, copy=True, - accept_sparse=_stype, - dtype=_dtype, ensure_2d=False) - if P2.ndim == 1: - P2 = np.asarray(P2) - if P2.shape[0] != n_features: - raise ValueError("P2 should be a 1d array of shape " - "(n_features,) with " - "n_features=X.shape[1]; " - "got (P2.shape=({0},)), needed ({1},)" - .format(P2.shape[0], X.shape[1])) - if sparse.issparse(X): - P2 = (sparse.dia_matrix((P2, 0), - shape=(n_features, n_features))).tocsc() - elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and - P2.shape[0] == X.shape[1]): - if sparse.issparse(X): - P2 = (sparse.dia_matrix((P2, 0), - shape=(n_features, n_features))).tocsc() - else: - raise ValueError("P2 must be either None or an array of shape " - "(n_features, n_features) with " - "n_features=X.shape[1]; " - "got (P2.shape=({0}, {1})), needed ({2}, {2})" - .format(P2.shape[0], P2.shape[1], X.shape[1])) - - l2 = self.alpha - # P2 is now for sure a copy - P2 = l2 * P2 - # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') - # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric - if P2.ndim == 2: - if sparse.issparse(P2): - if sparse.isspmatrix_csc(P2): - P2 = 0.5 * (P2 + P2.transpose()).tocsc() - else: - P2 = 0.5 * (P2 + P2.transpose()).tocsr() - else: - P2 = 0.5 * (P2 + P2.T) - - # For coordinate descent, if X is sparse, P2 must also be csc - if solver == 'cd' and sparse.issparse(X): - P2 = sparse.csc_matrix(P2) - # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - # check if P2 is positive semidefinite - # np.linalg.cholesky(P2) 'only' asserts positive definite - if not isinstance(self.P2, str): # self.P2 != 'identity' - # due to numerical precision, we allow eigenvalues to be a - # tiny bit negative - epsneg = -10 * np.finfo(P2.dtype).epsneg - if P2.ndim == 1 or P2.shape[0] == 1: - p2 = P2 - if sparse.issparse(P2): - p2 = P2.toarray() - if not np.all(p2 >= 0): - raise ValueError("1d array P2 must not have negative " - "values.") - elif sparse.issparse(P2): - # for sparse matrices, not all eigenvals can be computed - # efficiently, use only half of n_features - # k = how many eigenvals to compute - k = np.min([10, n_features // 10 + 1]) - sigma = 0 # start searching near this value - which = 'SA' # find smallest algebraic eigenvalues first - eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, - which=which, - return_eigenvectors=False) - if not np.all(eigenvalues >= epsneg): - raise ValueError("P2 must be positive semi-definite.") - else: - if not np.all(linalg.eigvalsh(P2) >= epsneg): - raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that X is not rank deficient # TODO: what else to check? @@ -1229,10 +965,10 @@ def fit(self, X, y, sample_weight=None): # 2. 
rescaling of weights (sample_weight) # ####################################################################### # IMPORTANT NOTE: Since we want to minimize - # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # 1/(2*sum(sample_weight)) * deviance + L2, # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes - # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) weights = weights/weights_sum @@ -1260,33 +996,21 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # algorithms for optimization - # 4.1 IRLS ############################################################ - # Note: we already set P2 = l2*P2, see above - # Note: we already symmetrized P2 = 1/2 (P2 + P2') - if solver == 'irls': - coef, self.n_iter_ = \ - _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, - fit_intercept=self.fit_intercept, family=family, - link=link, max_iter=self.max_iter, tol=self.tol) - - # 4.2 L-BFGS ########################################################## - elif solver == 'lbfgs': - def func(coef, X, y, weights, P2, family, link): + # 4.1 L-BFGS ########################################################## + if solver == 'lbfgs': + def func(coef, X, y, weights, alpha, family, link): mu, devp = \ family._mu_deviance_derivative(coef, X, y, weights, link) dev = family.deviance(y, mu, weights) intercept = (coef.size == X.shape[1] + 1) idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = P2 * coef[idx:] - else: - L2 = P2 @ coef[idx:] + L2 = alpha * coef[idx:] obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) objp = 0.5 * devp objp[idx:] += L2 return obj, objp - args = (X, y, weights, P2, family, link) + args = (X, y, weights, self.alpha, family, link) # TODO: refactor this once # https://github.com/scikit-learn/scikit-learn/pull/14250 # is merged. @@ -1492,13 +1216,9 @@ class PoissonRegressor(GeneralizedLinearRegressor): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'lbfgs'}, optional (default='irls') + solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: - 'irls' - Iterated reweighted least squares. It is the standard algorithm - for GLMs. - 'lbfgs' Calls scipy's L-BFGS-B optimizer. @@ -1506,7 +1226,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, and lbfgs solvers, + Stopping criterion. For the lbfgs solver, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. 
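# A minimal, self-contained sketch of the objective/gradient pair that the
# lbfgs solver minimizes, written out for the simplest case: Normal family,
# identity link, sample weights rescaled to sum to one, and an L2 penalty
# 0.5 * alpha * ||w||_2^2 that leaves the intercept unpenalized.
# All names below are illustrative only.
import numpy as np
import scipy.optimize


def obj_and_grad(coef, X, y, weights, alpha):
    # coef[0] is the intercept, coef[1:] are the feature coefficients
    mu = coef[0] + X @ coef[1:]
    dev = np.sum(weights * (y - mu) ** 2)        # deviance of the Normal family
    obj = 0.5 * dev + 0.5 * alpha * (coef[1:] @ coef[1:])
    devp = -2 * weights * (y - mu)               # d(deviance)/d(linear predictor)
    grad = 0.5 * np.concatenate(([devp.sum()], devp @ X))
    grad[1:] += alpha * coef[1:]
    return obj, grad


rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = X @ np.array([1., 2., 3.]) + 0.5
weights = np.full(20, 1 / 20)                    # sum(weights) == 1
res = scipy.optimize.minimize(
    obj_and_grad, np.zeros(X.shape[1] + 1), args=(X, y, weights, 0.1),
    method="L-BFGS-B", jac=True,
    options={"maxiter": 100, "gtol": 1e-6})
# res.x[0] is the fitted intercept, res.x[1:] the coefficients; res.jac holds
# the gradient g whose max{|g_i|} the tol stopping criterion refers to.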
@@ -1575,8 +1295,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, - solver='irls', max_iter=100, - tol=1e-4, warm_start=False, + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, random_state=None, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index b9716388fb36c..877b37993f587 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -25,7 +25,7 @@ from sklearn.utils.testing import assert_array_equal -GLM_SOLVERS = ['irls', 'lbfgs'] +GLM_SOLVERS = ['lbfgs'] @pytest.fixture(scope="module") @@ -193,39 +193,6 @@ def test_glm_alpha_argument(alpha): glm.fit(X, y) -@pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3]), [-1]]) -def test_glm_P2_argument(P2): - """Test GLM for invalid P2 argument.""" - y = np.array([1, 2]) - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(P2=P2, check_input=True) - with pytest.raises(ValueError): - glm.fit(X, y) - - -def test_glm_P2_positive_semidefinite(): - """Test GLM for a positive semi-definite P2 argument.""" - n_samples, n_features = 10, 5 - y = np.arange(n_samples) - X = np.zeros((n_samples, n_features)) - P2 = np.diag([100, 10, 5, 0, -1E-5]) - rng = np.random.RandomState(42) - # construct random orthogonal matrix Q - Q, R = linalg.qr(rng.randn(n_features, n_features)) - P2 = Q.T @ P2 @ Q - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, - check_input=True) - with pytest.raises(ValueError, match="P2 must be positive semi-definite"): - glm.fit(X, y) - - P2 = sparse.csr_matrix(P2) - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, - check_input=True) - with pytest.raises(ValueError, match="P2 must be positive semi-definite"): - glm.fit(X, y) - - @pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) def test_glm_fit_intercept_argument(fit_intercept): """Test GLM for invalid fit_intercept argument.""" @@ -287,16 +254,6 @@ def test_glm_random_state_argument(random_state): glm.fit(X, y) -@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) -def test_glm_diag_fisher_argument(diag_fisher): - """Test GLM for invalid diag_fisher arguments.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) - with pytest.raises(ValueError, match="diag_fisher must be bool"): - glm.fit(X, y) - - @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -336,9 +293,7 @@ def test_glm_identity_regression(solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), ]) -@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), - ('lbfgs', 1e-6), -]) +@pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-6)]) def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] @@ -391,10 +346,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) -@pytest.mark.parametrize('solver, tol', - [('irls', 1e-7), - ('lbfgs', 1e-7), -]) +@pytest.mark.parametrize('solver, tol', [('lbfgs', 
1e-7)]) def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. @@ -426,9 +378,6 @@ def test_poisson_ridge(solver, tol): @pytest.mark.parametrize( "params", [ - {"solver": "irls" }, - {"solver": "irls" }, - {"solver": "lbfgs" }, {"solver": "lbfgs"}, ], ids=lambda params: ', '.join("%s=%s" % (key, val) From 1e4b5380b7ad699e5d059ef6d9836c37a7ba16e2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:23:31 -0500 Subject: [PATCH 073/209] Remove plot_poisson_spline_regression.py example --- .../plot_poisson_spline_regression.py | 85 ------------------- 1 file changed, 85 deletions(-) delete mode 100644 examples/linear_model/plot_poisson_spline_regression.py diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py deleted file mode 100644 index 30b5881bba1f5..0000000000000 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -================================= -Poisson Regression with B-Splines -================================= - -As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` -example, a Poisson regression with penalized B-splines (P-splines) [1]_ is -fitted on slightly different sinusoidal, Poisson distributed data and -compared to an AdaBoost model with decision trees. -One can see, that this is a hard problem for both estimators. - -.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines - and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. - `doi:10.1214/ss/1038425655 - `_ - -""" -print(__doc__) - -# Author: Christian Lorentzen -# based on the AdaBoost regression example from Noel Dawe -# License: BSD 3 clause - -# importing necessary libraries -import numpy as np -from scipy.linalg import toeplitz -# from scipy.interpolate import BSpline -from scipy.interpolate import splev -import matplotlib.pyplot as plt -from sklearn.tree import DecisionTreeRegressor -from sklearn.ensemble import AdaBoostRegressor -from sklearn.linear_model import GeneralizedLinearRegressor - - -# Create the dataset -xmin, xmax = 0, 6 -rng = np.random.RandomState(1) -X = np.linspace(xmin, xmax, 500)[:, np.newaxis] -y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) -y = rng.poisson(y_true, X.shape[0]) - -# b-spline basis -nknots, degree = 40, 3 -ns = nknots - degree - 1 # number of base spline functions -dx = (xmax - xmin) / (nknots - 1 - 2 * degree) -knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) -coef = np.zeros(ns) -splineBasis = np.empty((X.shape[0], ns), dtype=float) -for i in range(ns): - coef[i] = 1 -# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ -# .ravel() - splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() - coef[i] = 0 - -# second order difference matrix -P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) -P2[0, 0] = P2[-1, -1] = 1 - -# Fit regression model -regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), - n_estimators=10, random_state=rng) - -regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', - fit_intercept=True, alpha=0.02, - l1_ratio=0.1, P2=P2) - -regr_1.fit(X, y) -regr_2.fit(splineBasis, y) - -# Predict -y_1 = regr_1.predict(X) -y_2 = regr_2.predict(splineBasis) - -# Plot the results -plt.figure() -plt.plot(X, y_true, c="b", label="true mean") -plt.scatter(X, y, c="k", marker='.', label="training samples") -plt.plot(X, y_1, c="g", label="AdaBoost n_estimator=10", linewidth=2) 
-plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) -plt.xlabel("data") -plt.ylabel("target") -plt.title("Regression Comparison") -plt.legend() -plt.show() From 3265148aa4df59bd1a4c8ea5eb8e2102bb650006 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:38:48 -0500 Subject: [PATCH 074/209] Remove random_state parameter --- sklearn/linear_model/_glm.py | 30 ++++--------------- sklearn/linear_model/tests/test_glm.py | 41 ++++++++------------------ 2 files changed, 17 insertions(+), 54 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index b469b3c4edd17..4a1f7c260e649 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -10,15 +10,13 @@ from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import linalg, sparse, special -import scipy.sparse.linalg as splinalg +from scipy import sparse, special from scipy.optimize import fmin_l_bfgs_b import warnings from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.validation import check_is_fitted, check_random_state - +from ..utils.validation import check_is_fitted def _check_weights(sample_weight, n_samples): @@ -680,7 +678,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * |w|_2 - with inverse link function h and s=sample_weight. + with inverse link function h and s=sample_weight. The parameter ``alpha`` corresponds to the lambda parameter in glmnet. Read more in the :ref:`User Guide `. @@ -742,18 +740,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the lbfgs solver, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. + the objective function. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_``. - random_state : {int, RandomState instance, None}, optional (default=None) - If int, random_state is the seed used by the random - number generator; if RandomState instance, random_state is the random - number generator; if None, the random number generator is the - RandomState instance used by `np.random`. - copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
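# A minimal sketch: with family='normal' and link='identity', the objective
# above is ridge regression up to the 1/(2*sum(s)) rescaling of the data term,
# so with unit sample weights ``alpha`` here corresponds to
# ``Ridge(alpha=n_samples * alpha)``.  Tolerances below are illustrative.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import GeneralizedLinearRegressor, Ridge

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
glm = GeneralizedLinearRegressor(alpha=1.0, family='normal',
                                 link='identity', tol=1e-8, max_iter=1000)
ridge = Ridge(alpha=1.0 * X.shape[0], solver='svd')
glm.fit(X, y)
ridge.fit(X, y)
print(np.max(np.abs(glm.coef_ - ridge.coef_)))   # expected to be small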
@@ -819,7 +811,6 @@ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, - random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept @@ -830,7 +821,6 @@ def __init__(self, alpha=1.0, self.max_iter = max_iter self.tol = tol self.warm_start = warm_start - self.random_state = random_state self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -929,7 +919,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -959,7 +948,6 @@ def fit(self, X, y, sample_weight=None): "range for family {0}" .format(family.__class__.__name__)) # TODO: if alpha=0 check that X is not rank deficient - # TODO: what else to check? ####################################################################### # 2. rescaling of weights (sample_weight) # @@ -1027,7 +1015,6 @@ def func(coef, X, y, weights, alpha, family, link): .format(info["task"])) self.n_iter_ = info['nit'] - ####################################################################### # 5. postprocessing # ####################################################################### @@ -1235,12 +1222,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_`` . - random_state : {int, RandomState instance, None}, optional (default=None) - If int, random_state is the seed used by the random - number generator; if RandomState instance, random_state is the random - number generator; if None, the random number generator is the - RandomState instance used by `np.random`. - copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
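# A minimal sketch: PoissonRegressor is a convenience class that fixes
# family='poisson' and link='log', so the two fits below are expected to
# coincide.  The toy data are for illustration only.
import numpy as np
from sklearn.linear_model import GeneralizedLinearRegressor, PoissonRegressor

X = np.array([[-2., -1., 1., 2.], [0., 0., 1., 1.]]).T
y = np.array([0, 1, 1, 2])

pois = PoissonRegressor(alpha=1.0, tol=1e-8)
pois.fit(X, y)
glm = GeneralizedLinearRegressor(alpha=1.0, family='poisson', link='log',
                                 tol=1e-8)
glm.fit(X, y)
print(pois.intercept_, glm.intercept_)   # expected to agree
print(pois.coef_, glm.coef_)             # expected to agree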
@@ -1296,11 +1277,10 @@ class PoissonRegressor(GeneralizedLinearRegressor): """ def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - random_state=None, copy_X=True, check_input=True, verbose=0): + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, - random_state=random_state, copy_X=copy_X, verbose=verbose) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 877b37993f587..a921c7e065878 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -5,10 +5,8 @@ import numpy as np from numpy.testing import assert_allclose import pytest -import scipy as sp -from scipy import linalg, optimize, sparse -from sklearn.datasets import make_classification, make_regression +from sklearn.datasets import make_regression from sklearn.linear_model import GeneralizedLinearRegressor from sklearn.linear_model._glm import ( Link, @@ -19,7 +17,7 @@ NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, ) -from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge +from sklearn.linear_model import Ridge from sklearn.metrics import mean_absolute_error from sklearn.exceptions import ConvergenceWarning @@ -101,8 +99,7 @@ def test_tweedie_distribution_power(): (TweedieDistribution(power=-1), [0.1, 1.5]), (TweedieDistribution(power=1.5), [0.1, 1.5]), (TweedieDistribution(power=2.5), [0.1, 1.5]), - (TweedieDistribution(power=-4), [0.1, 1.5]), -]) + (TweedieDistribution(power=-4), [0.1, 1.5])]) def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: @@ -151,8 +148,7 @@ def test_sample_weights_validation(): [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution()), -]) + ('inverse.gaussian', InverseGaussianDistribution())]) def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions @@ -244,16 +240,6 @@ def test_glm_warm_start_argument(warm_start): glm.fit(X, y) -@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) -def test_glm_random_state_argument(random_state): - """Test GLM for invalid random_state argument.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(random_state=random_state) - with pytest.raises(ValueError, match="cannot be used to seed"): - glm.fit(X, y) - - @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -291,8 +277,7 @@ def test_glm_identity_regression(solver): 'family', [NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), -]) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]) @pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-6)]) def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" @@ -338,7 +323,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', 
link='identity', fit_intercept=True, max_iter=300, solver=solver, tol=1e-6, - check_input=False, random_state=42) + check_input=False) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) @@ -369,7 +354,7 @@ def test_poisson_ridge(solver, tol): fit_intercept=True, family='poisson', link='log', tol=1e-7, solver=solver, max_iter=300, - random_state=rng) + ) glm.fit(X, y) assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) @@ -385,11 +370,10 @@ def test_poisson_ridge(solver, tol): ) def test_solver_equivalence(params, regression_data): X, y = regression_data - est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref = GeneralizedLinearRegressor() est_ref.fit(X, y) estimator = GeneralizedLinearRegressor(**params) - estimator.set_params(random_state=2) estimator.fit(X, y) @@ -405,16 +389,15 @@ def test_solver_equivalence(params, regression_data): def test_fit_dispersion(regression_data): X, y = regression_data - est1 = GeneralizedLinearRegressor(random_state=2) + est1 = GeneralizedLinearRegressor() est1.fit(X, y) assert not hasattr(est1, "dispersion_") - est2 = GeneralizedLinearRegressor(random_state=2, fit_dispersion="chisqr") + est2 = GeneralizedLinearRegressor(fit_dispersion="chisqr") est2.fit(X, y) assert isinstance(est2.dispersion_, float) - est3 = GeneralizedLinearRegressor( - random_state=2, fit_dispersion="deviance") + est3 = GeneralizedLinearRegressor(fit_dispersion="deviance") est3.fit(X, y) assert isinstance(est3.dispersion_, float) @@ -425,7 +408,7 @@ def test_fit_dispersion(regression_data): def test_convergence_warning(solver, regression_data): X, y = regression_data - est = GeneralizedLinearRegressor(solver=solver, random_state=2, + est = GeneralizedLinearRegressor(solver=solver, max_iter=1, tol=1e-20) with pytest.warns(ConvergenceWarning): est.fit(X, y) From 1862ab6811db3a6a4ad54719fcd048dc151d0a37 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:55:13 -0500 Subject: [PATCH 075/209] Lint --- sklearn/linear_model/tests/test_glm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a921c7e065878..1712f7b5e1d3d 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -349,7 +349,6 @@ def test_poisson_ridge(solver, tol): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - rng = np.random.RandomState(42) glm = GeneralizedLinearRegressor(alpha=1, fit_intercept=True, family='poisson', link='log', tol=1e-7, From 4154074a7367be310976a8a1bba00ed737cf9e3d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 10 Jul 2019 14:16:05 +0200 Subject: [PATCH 076/209] Fix docstring --- doc/modules/linear_model.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index eba51315d2ae8..681a13cdf9d42 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -940,9 +940,9 @@ follows: >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.coef_ - array([0.24630169, 0.43373464]) + array([0.2463..., 0.4337...]) >>> reg.intercept_ - -0.76383633... + -0.7638... .. 
topic:: Examples: From c5d77d78d78a57b242e401f0f2f3efa572e74105 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 10 Jul 2019 14:20:42 +0200 Subject: [PATCH 077/209] Remove unused core --- sklearn/linear_model/_glm.py | 68 +----------------------------------- 1 file changed, 1 insertion(+), 67 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 4a1f7c260e649..244c781ae8f48 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -10,7 +10,7 @@ from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import sparse, special +from scipy import special from scipy.optimize import fmin_l_bfgs_b import warnings from ..base import BaseEstimator, RegressorMixin @@ -54,42 +54,6 @@ def _safe_lin_pred(X, coef): return X @ coef -def _safe_toarray(X): - """Returns a numpy array.""" - if sparse.issparse(X): - return X.toarray() - else: - return np.asarray(X) - - -def _safe_sandwich_dot(X, d, intercept=False): - """Compute sandwich product X.T @ diag(d) @ X. - - With ``intercept=True``, X is treated as if a column of 1 were appended as - first column of X. - X can be sparse, d must be an ndarray. Always returns a ndarray.""" - if sparse.issparse(X): - temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) - # for older versions of numpy and scipy, temp may be a np.matrix - temp = _safe_toarray(temp) - else: - temp = (X.T * d) @ X - if intercept: - dim = X.shape[1] + 1 - if sparse.issparse(X): - order = 'F' if sparse.isspmatrix_csc(X) else 'C' - else: - order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' - res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) - res[0, 0] = d.sum() - res[1:, 0] = d @ X - res[0, 1:] = res[1:, 0] - res[1:, 1:] = temp - else: - res = temp - return res - - class Link(metaclass=ABCMeta): """Abstract base class for Link functions.""" @@ -250,7 +214,6 @@ class ExponentialDispersionModel(metaclass=ABCMeta): starting_mu _mu_deviance_derivative - _score References ---------- @@ -484,35 +447,6 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): devp = temp @ X # sampe as X.T @ temp return mu, devp - def _score(self, coef, phi, X, y, weights, link): - r"""Compute the score function. - - The score function is the derivative of the - log-likelihood w.r.t. `coef` (:math:`w`). - It is given by - - .. math: - - \mathbf{score}(\boldsymbol{w}) - = \frac{\partial loglike}{\partial\boldsymbol{w}} - = \mathbf{X}^T \mathbf{D} - \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, - - with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and - :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. - Note: The derivative of the deviance w.r.t. coef equals -2 * score. - """ - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d = link.inverse_derivative(lin_pred) - temp = sigma_inv * d * (y - mu) - if coef.size == X.shape[1] + 1: - score = np.concatenate(([temp.sum()], temp @ X)) - else: - score = temp @ X # sampe as X.T @ temp - return score - class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. 
From 9ab5ac2506bd33d84c96f9f848b62cb4ee5b0853 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:14 +0200 Subject: [PATCH 078/209] Update examples/linear_model/plot_poisson_regression_non_normal_loss.py Co-Authored-By: Alexandre Gramfort --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index b06adcb787560..471c137840e82 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -1,6 +1,6 @@ """ ====================================== -Poisson regression and non normal loss +Poisson regression and non-normal loss ====================================== This example illustrate the use linear Poisson regression From e4d0be190239435e7dcd4d27cd616006909db98b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:26 +0200 Subject: [PATCH 079/209] Update examples/linear_model/plot_poisson_regression_non_normal_loss.py Co-Authored-By: Alexandre Gramfort --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 471c137840e82..0537704b2cf1f 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -3,7 +3,7 @@ Poisson regression and non-normal loss ====================================== -This example illustrate the use linear Poisson regression +This example illustrates the use of linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compare it with learning models with least squared error. From 6ff4d588f5adae2aed6810ddff64733c5bc595a3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:50 +0200 Subject: [PATCH 080/209] Update doc/modules/linear_model.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 681a13cdf9d42..2f750b33623aa 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -929,7 +929,7 @@ Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with -:math:`h(Xw)=\exp(Xw)`. +:math:`h(x^\top w)=\exp(x^\top w)`. Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. 
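For instance, the scaling can be handled inside a pipeline so that the same
standardization is applied again at prediction time (a minimal sketch; the toy
data and the penalty strength are illustrative only)::

    import numpy as np
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[1., 100.], [2., 130.], [3., 70.], [4., 40.]])
    y = np.array([0.5, 1.1, 1.4, 2.3])

    model = make_pipeline(
        StandardScaler(),
        GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log'))
    model.fit(X, y)
    print(model.predict(X))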
The estimator can be used as From 13102d5a2b4538734f162a70979df874a31c7798 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:59 +0200 Subject: [PATCH 081/209] Update doc/modules/linear_model.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 2f750b33623aa..49662064827f3 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -928,7 +928,7 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with :math:`h(x^\top w)=\exp(x^\top w)`. Note that the feature matrix `X` should be standardized before fitting. This From af89e5281b880b19346db625838e1623275b438a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:14:12 +0200 Subject: [PATCH 082/209] Update doc/modules/linear_model.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 49662064827f3..c8c103288c5ab 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -898,7 +898,7 @@ ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as -.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). +.. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an exponential dispersion model (EDM) [11]_. The objective function being minimized From ddc4b717bcece9c8d0347d46b9f4a89b681aec96 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:27:22 +0200 Subject: [PATCH 083/209] Use scipy.optimize.minimize interface for LBFGS optimizer --- sklearn/linear_model/_glm.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 244c781ae8f48..939249e42e4f4 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,16 +6,16 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -from __future__ import division from abc import ABCMeta, abstractmethod import numbers + import numpy as np from scipy import special -from scipy.optimize import fmin_l_bfgs_b -import warnings +import scipy.optimize + from ..base import BaseEstimator, RegressorMixin -from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y +from ..utils.optimize import _check_optimize_result from ..utils.validation import check_is_fitted @@ -933,21 +933,18 @@ def func(coef, X, y, weights, alpha, family, link): return obj, objp args = (X, y, weights, self.alpha, family, link) - # TODO: refactor this once - # https://github.com/scikit-learn/scikit-learn/pull/14250 - # is merged. 
- coef, loss, info = fmin_l_bfgs_b( - func, coef, fprime=None, args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter, factr=1e3) - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) - self.n_iter_ = info['nit'] + + opt_res = scipy.optimize.minimize( + func, coef, method="L-BFGS-B", jac=True, + options={ + "maxiter": self.max_iter, + "iprint": (self.verbose > 0) - 1, + "gtol": self.tol, + "ftol": 1e3*np.finfo(float).eps, + }, + args=args) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res) + coef = opt_res.x ####################################################################### # 5. postprocessing # From 426ae1d711c27d14a7dd7d22763fee5d9d20c1f5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Jul 2019 18:20:09 +0200 Subject: [PATCH 084/209] EXA wording and score in plot_tweedie_regression_insurance_claims.html --- ...lot_tweedie_regression_insurance_claims.py | 137 ++++++++++-------- 1 file changed, 78 insertions(+), 59 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 063d12e6e291b..00111b811f923 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -3,7 +3,7 @@ Tweedie regression on insurance claims ====================================== -This example illustrate the use Poisson, Gamma and Tweedie regression +This example illustrates the use of Poisson, Gamma and Tweedie regression on the French Motor Third-Party Liability Claims dataset, and is inspired by an R tutorial [1]. @@ -13,9 +13,10 @@ which are: 1. Model the number of claims with a Poisson distribution, the average - claim amount as a Gamma distribution and multiply the predictions, to get - the total claim amount. -2. Model total claim amount directly, typically with a Tweedie distribution. + claim amount as a Gamma distribution and multiply the predictions of both in + order to get the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution of + Tweedie power :math:`p \\in (1, 2)`. In this example we will illustrate both approaches. We start by defining a few helper functions for loading the data and visualizing results. @@ -49,7 +50,7 @@ def load_mtpl2(n_samples=100000): - """Fetcher for French Motor Third-Party Liability Claims dataset + """Fetch the French Motor Third-Party Liability Claims dataset. Parameters ---------- @@ -81,24 +82,27 @@ def load_mtpl2(n_samples=100000): return df.iloc[:n_samples] -def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, +def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, title=None, kind_weight=None, ax=None): """Plot observed and predicted - aggregated per feature level. 
Parameters ---------- - df : DataFrame with at least one column named feature + df : DataFrame with at least three columns named feature, weight and + observed + feature: str + a column name of df for the feature to be plotted + weight : str + column name of df with the values of weights or exposure observed : str - a column name of the observed target + a column name of df with the observed target predicted : frame a dataframe, with the same index as df, with the predicted target - weight : str - column name with the values of weights/exposure """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() df_["observed"] = df[observed] * df[weight] - df_["predicted"] = y_predicted * df[weight] + df_["predicted"] = predicted * df[weight] df_ = ( df_.groupby([feature])[weight, "observed", "predicted"] .sum() @@ -126,9 +130,10 @@ def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, # 1. Loading datasets and pre-processing # -------------------------------------- # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, -# containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same user ids. +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``), with the freMTPL2sev table, +# containing the claim amount (``ClaimAmount``) for the same policy ids +# (``IDpol``). df = load_mtpl2(n_samples=100000) @@ -206,14 +211,14 @@ def score_estimator( y, _weights = df[target], df[weights] for score_label, metric in [ - ("D² explaned", None), + ("D² explained", None), ("mean deviance", partial(mean_deviance, estimator)), ("mean abs. error", mean_absolute_error), ]: if estimator.__class__.__name__ == "ClaimProdEstimator": - # ClaimProdEstimator is the product of the frequency and - # severity models, together with a denormalized by the exposure - # values. It does not fully follow the scikit-learn API and we + # ClaimProdEstimator is the product of frequency and severity + # models, denormalized by the exposure values. + # It does not fully follow the scikit-learn API and we # must handle it separately. y_pred = estimator.predict(X, exposure=df.Exposure.values) else: @@ -253,50 +258,50 @@ def score_estimator( # # We can visually compare observed and predicted values, aggregated by # the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance -# bonus/penalty (``BonusMalus``), +# bonus/malus (``BonusMalus``). 
fig, ax = plt.subplots(2, 2, figsize=(16, 8)) fig.subplots_adjust(hspace=0.3, wspace=0.2) plot_obs_pred( - df_train, - "DrivAge", - "Frequency", - glm_freq.predict(X_train), + df=df_train, + feature="DrivAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_train), y_label="Claim Frequency", title="train data", ax=ax[0, 0], ) plot_obs_pred( - df_test, - "DrivAge", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="DrivAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[0, 1], ) plot_obs_pred( - df_test, - "VehAge", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="VehAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[1, 0], ) plot_obs_pred( - df_test, - "BonusMalus", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="BonusMalus", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[1, 1], @@ -308,12 +313,13 @@ def score_estimator( # 3. Severity model - Gamma Distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically -# shown to follow a Gamma distribution. We fit a GLM model for the severity -# with the same features as the frequency model. +# shown to follow approximately a Gamma distribution. We fit a GLM model for +# the severity with the same features as the frequency model. # # Note: -# - We filter out ``ClaimAmount == 0``` as the Gamma distribution as support -# on :math:`(0, \infty)` not :math:`[0, \infty)`. +# +# - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support +# on :math:`(0, \infty)`, not :math:`[0, \infty)`. # - We use ``ClaimNb`` as sample weights. mask_train = df_train["ClaimAmount"] > 0 @@ -341,19 +347,20 @@ def score_estimator( ############################################################################## # -# Note that the resulting model is conditional on having at least one claim, -# and cannot be used to predict the average claim amount in general, +# Note that the resulting model is the average claim amount per claim. As such, +# it is conditional on having at least one claim, and cannot be used to predict +# the average claim amount per policy in general. print( - "Mean AvgClaim Amount: %.2f " + "Mean AvgClaim Amount per policy: %.2f " % df_train.AvgClaimAmount.mean() ) print( - "Mean AvgClaim Amount | NbClaim > 0: %.2f" + "Mean AvgClaim Amount | NbClaim > 0: %.2f" % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() ) print( - "Predicted Mean AvgClaim Amount: %.2f" + "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" % glm_sev.predict(X_train).mean() ) @@ -361,28 +368,28 @@ def score_estimator( ############################################################################## # # We can visually compare observed and predicted values, aggregated for -# the drivers age (``Driv Age``), +# the drivers age (``DrivAge``). 
fig, ax = plt.subplots(1, 2, figsize=(16, 4)) # plot DivAge plot_obs_pred( - df_train.loc[mask_train], - "DrivAge", - "AvgClaimAmount", - glm_sev.predict(X_train[mask_train.values]), + df=df_train.loc[mask_train], + feature="DrivAge", weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_train[mask_train.values]), y_label="Average Claim Severity", title="train data", ax=ax[0], ) plot_obs_pred( - df_test.loc[mask_test], - "DrivAge", - "AvgClaimAmount", - glm_sev.predict(X_test[mask_test.values]), + df=df_test.loc[mask_test], + feature="DrivAge", weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_test[mask_test.values]), y_label="Average Claim Severity", title="test data", ax=ax[1], @@ -391,31 +398,41 @@ def score_estimator( ############################################################################## # -# 3. Total Claims Amount -- Compound Poisson distribution +# 4. Total Claims Amount -- Compound Poisson distribution # ------------------------------------------------------- # # As mentionned in the introduction, the total claim amount can be modeled -# either as the product of the frequency model by the severity model. +# either as the product of the frequency model by the severity model, class ClaimProdEstimator: - """Total claim amount estimator + """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. + denormalized by exposure. Use Tweedie deviance with `p=1.5`. """ def __init__(self, est_freq, est_sev): self.est_freq = est_freq self.est_sev = est_sev + self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): - """Predict the total claim amount + """Predict the total claim amount. The predict method is not compatible with the scikit-learn API. """ return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) + def score(self, X, y, sample_weight=None): + """Compute D², the percentage of deviance explained.""" + mu = self.predict(X, exposure=sample_weight) + dev = self._family_instance.deviance(y, mu, weights=sample_weight) + y_mean = np.average(y, weights=sample_weight) + dev_null = self._family_instance.deviance(y, y_mean, + weights=sample_weight) + return 1. - dev / dev_null + est_prod = ClaimProdEstimator(glm_freq, glm_sev) @@ -476,7 +493,9 @@ def predict(self, X, exposure): # model than when using separate models for frequency and severity. # # We can additionally validate these models by comparing observed and predicted -# total claim amount over the test and train subsets. +# total claim amount over the test and train subsets. We see that in our case +# the frequency-severity model underestimates the total claim amount, whereas +# the Tweedie model overestimates. 
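# A minimal sketch of the second approach from the introduction: fit a single
# Tweedie GLM (power=1.5) on the total claim amount per unit of exposure.
# The name ``glm_total`` and the penalty strength are illustrative only, and
# ``TweedieDistribution`` is assumed to be importable here, as it already is
# for ``ClaimProdEstimator`` above.
glm_total = GeneralizedLinearRegressor(family=TweedieDistribution(power=1.5),
                                       link='log', alpha=1e-2)
glm_total.fit(X_train, df_train["ClaimAmount"] / df_train["Exposure"],
              sample_weight=df_train["Exposure"])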
res = [] for subset_label, X, df in [ From a4043847d0dfde68bb09dc0d9de3ab10fc07d41b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 15 Jul 2019 14:15:26 +0200 Subject: [PATCH 085/209] Address review comments --- .../plot_tweedie_regression_insurance_claims.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 00111b811f923..1c8dd42df336d 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -46,7 +46,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.metrics import mean_absolute_error +from sklearn.metrics import mean_absolute_error, mean_squared_error def load_mtpl2(n_samples=100000): @@ -83,7 +83,7 @@ def load_mtpl2(n_samples=100000): def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, - title=None, kind_weight=None, ax=None): + title=None, ax=None): """Plot observed and predicted - aggregated per feature level. Parameters @@ -141,9 +141,11 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # requires a strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 -# correct for unreasonable observations (that might be data error) +# Correct for unreasonable observations (that might be data error) +# and a few exceptionally large claim amounts df["ClaimNb"] = df["ClaimNb"].clip(upper=4) df["Exposure"] = df["Exposure"].clip(upper=1) +df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) column_trans = ColumnTransformer( [ @@ -188,7 +190,9 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +# Some of the features are colinear, we use a weak penalization to avoid +# numerical issues. +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=1e-2) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) @@ -214,6 +218,7 @@ def score_estimator( ("D² explained", None), ("mean deviance", partial(mean_deviance, estimator)), ("mean abs. error", mean_absolute_error), + ("mean squared error", mean_squared_error), ]: if estimator.__class__.__name__ == "ClaimProdEstimator": # ClaimProdEstimator is the product of frequency and severity @@ -325,7 +330,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GeneralizedLinearRegressor(family="gamma", alpha=1) +glm_sev = GeneralizedLinearRegressor(family="gamma") glm_sev.fit( X_train[mask_train.values], From 65796a3d3e74aff111c187b6be20e0e9287e0797 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 16:41:02 +0200 Subject: [PATCH 086/209] Review comments on the documentation --- doc/modules/linear_model.rst | 50 +++++++++++++++++------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index c8c103288c5ab..b6f7c2b82c1f5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,27 +893,23 @@ to warm-starting (see :term:`Glossary `). 
Generalized Linear Regression ============================= -:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear +:class:`GeneralizedLinearRegressor` generalizes linear models in two ways +[10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function being minimized -becomes +exponential dispersion model (EDM) [11]_. The objective function being +minimized becomes -.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 - +\frac{\alpha(1-\rho)}{2} w^T P_2 w +.. math:: \frac{1}{2 \sum s_i}D(y, \hat{y}; s) +\frac{\alpha}{2} ||w||_2 -with sample weights :math:`s`. -:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in -the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows -for a more versatile L2 penalty. +with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. -Use cases, where a loss different from the squared loss might be appropriate, -are the following: +In the following use cases, a loss different from the squared loss might be +appropriate, * If the target values :math:`y` are counts (non-negative integer valued) or frequencies (non-negative), you might use a Poisson deviance with log-link. @@ -928,10 +924,10 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with -:math:`h(x^\top w)=\exp(x^\top w)`. +:math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the +log-link with :math:`h(x^\top w)=\exp(x^\top w)`. -Note that the feature matrix `X` should be standardized before fitting. This +Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows: @@ -947,7 +943,8 @@ follows: .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + * :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py` + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_regression_non_normal_loss.py` Mathematical formulation ------------------------ @@ -967,12 +964,9 @@ Note that the first assumption implies function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the same as specifying a unit variance function (they are one-to-one). -Including penalties helps to avoid overfitting or, in case of L1 penalty, to -obtain sparse solutions. But there are also other motivations to include them, -e.g. accounting for the dependence structure of :math:`y`. - -The objective function, which is independent of :math:`\phi`, is minimized with -respect to the coefficients :math:`w`. +The objective function (the penalized negative log likelihood) is +independent of :math:`\phi` and is minimized with respect to the +coefficients :math:`w`. 
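For instance, for the Poisson distribution the deviance entering this
objective is built from the unit deviance
:math:`d(y, \mu) = 2(y\log(y/\mu) - y + \mu)` (defined below), which can be
evaluated directly (a minimal numerical sketch)::

    import numpy as np
    from scipy.special import xlogy

    y = np.array([0., 1., 4.])
    mu = np.array([0.5, 1.0, 3.0])
    # y*log(y/mu) is taken as 0 for y=0
    unit_deviance = 2 * (xlogy(y, y / mu) - y + mu)
    deviance = np.sum(1.0 * unit_deviance)   # weighted sum, unit sample weights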
The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as @@ -1005,12 +999,16 @@ Two remarks: .. topic:: References: - .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. - See also `Exponential dispersion model. `_ + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ - .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. + `_ Stochastic Gradient Descent - SGD ================================= From e44afe7b6c9d2c7169ea2d61ecc58c389e67c6de Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 18:08:30 +0200 Subject: [PATCH 087/209] Split the implementation into several files --- sklearn/linear_model/__init__.py | 3 +- sklearn/linear_model/_glm/__init__.py | 5 + sklearn/linear_model/_glm/distribution.py | 443 ++++++++++++++ sklearn/linear_model/{_glm.py => _glm/glm.py} | 576 +----------------- sklearn/linear_model/_glm/link.py | 135 ++++ sklearn/linear_model/_glm/tests/__init__.py | 1 + .../_glm/tests/test_distribution.py | 61 ++ .../linear_model/{ => _glm}/tests/test_glm.py | 78 +-- sklearn/linear_model/_glm/tests/test_link.py | 38 ++ 9 files changed, 702 insertions(+), 638 deletions(-) create mode 100644 sklearn/linear_model/_glm/__init__.py create mode 100644 sklearn/linear_model/_glm/distribution.py rename sklearn/linear_model/{_glm.py => _glm/glm.py} (62%) create mode 100644 sklearn/linear_model/_glm/link.py create mode 100644 sklearn/linear_model/_glm/tests/__init__.py create mode 100644 sklearn/linear_model/_glm/tests/test_distribution.py rename sklearn/linear_model/{ => _glm}/tests/test_glm.py (81%) create mode 100644 sklearn/linear_model/_glm/tests/test_link.py diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 121418f901a1a..1c0df55d27c90 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,7 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (TweedieDistribution, - GeneralizedLinearRegressor, PoissonRegressor) +from ._glm.glm import (GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py new file mode 100644 index 0000000000000..9a88e5604de8a --- /dev/null +++ b/sklearn/linear_model/_glm/__init__.py @@ -0,0 +1,5 @@ +# License: BSD 3 clause + +from . 
import distribution + +__all__ = ['distribution'] diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py new file mode 100644 index 0000000000000..2dc720124b48b --- /dev/null +++ b/sklearn/linear_model/_glm/distribution.py @@ -0,0 +1,443 @@ +""" +Distribution functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod +import numbers + +import numpy as np +from scipy.special import xlogy + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + include_lower_bound + include_upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _mu_deviance_derivative + + References + ---------- + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + @property + def lower_bound(self): + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound + + @property + def upper_bound(self): + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound + + @property + def include_lower_bound(self): + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound + + @property + def include_upper_bound(self): + """Get True if upper bound for y is included: y <= upper_bound.""" + return self._include_upper_bound + + def in_y_range(self, x): + """Returns ``True`` if x is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. + """ + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) + + @abstractmethod + def unit_variance(self, mu): + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + pass + + @abstractmethod + def unit_variance_derivative(self, mu): + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. 
+ + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. + """ + pass + + def variance(self, mu, phi=1, weights=1): + r"""Compute the variance function. + + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weights=1): + r"""Compute the derivative of the variance w.r.t. mu. + + Returns + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + pass + + def unit_deviance_derivative(self, y, mu): + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + return -2 * (y - mu) / self.unit_variance(mu) + + def deviance(self, y, mu, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, mu)) + + def deviance_derivative(self, y, mu, weights=1): + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. 
+
+        weights : array, shape (n_samples,) (default=1)
+            Weights or exposure to which variance is inverse proportional.
+
+        ind_weight : float (default=0.5)
+            Must be between 0 and 1. Specifies how much weight is given to the
+            individual observations instead of the mean of y.
+        """
+        return (ind_weight * y +
+                (1. - ind_weight) * np.average(y, weights=weights))
+
+    def _mu_deviance_derivative(self, coef, X, y, weights, link):
+        """Compute mu and the derivative of the deviance w.r.t coef."""
+        lin_pred = _safe_lin_pred(X, coef)
+        mu = link.inverse(lin_pred)
+        d1 = link.inverse_derivative(lin_pred)
+        temp = d1 * self.deviance_derivative(y, mu, weights)
+        if coef.size == X.shape[1] + 1:
+            devp = np.concatenate(([temp.sum()], temp @ X))
+        else:
+            devp = temp @ X  # same as X.T @ temp
+        return mu, devp
+
+
+class TweedieDistribution(ExponentialDispersionModel):
+    r"""A class for the Tweedie distribution.
+
+    A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely
+    defined by its mean-variance relationship
+    :math:`\mathrm{Var}[Y] \propto \mu^{power}`.
+
+    Special cases are:
+
+    ===== ================
+    Power Distribution
+    ===== ================
+    0     Normal
+    1     Poisson
+    (1,2) Compound Poisson
+    2     Gamma
+    3     Inverse Gaussian
+    ===== ================
+
+    Parameters
+    ----------
+    power : float (default=0)
+            The variance power of the `unit_variance`
+            :math:`v(\mu) = \mu^{power}`.
+            For ``0 < power < 1``, no distribution exists.
+    """
+    def __init__(self, power=0):
+        self.power = power
+
+    @property
+    def power(self):
+        return self._power
+
+    @power.setter
+    def power(self, power):
+        if not isinstance(power, numbers.Real):
+            raise TypeError('power must be a real number, input was {0}'
+                            .format(power))
+
+        self._upper_bound = np.Inf
+        self._include_upper_bound = False
+        if power < 0:
+            # 'Extreme stable'
+            self._lower_bound = -np.Inf
+            self._include_lower_bound = False
+        elif power == 0:
+            # NormalDistribution
+            self._lower_bound = -np.Inf
+            self._include_lower_bound = False
+        elif (power > 0) and (power < 1):
+            raise ValueError('For 0 < power < 1, no distribution exists.')
+        elif power == 1:
+            # PoissonDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = True
+        elif (power > 1) and (power < 2):
+            # Compound Poisson
+            self._lower_bound = 0
+            self._include_lower_bound = True
+        elif power == 2:
+            # GammaDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif (power > 2) and (power < 3):
+            # Positive Stable
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif power == 3:
+            # InverseGaussianDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif power > 3:
+            # Positive Stable
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        else:  # pragma: no cover
+            # this branch should be unreachable.
+            raise ValueError
+
+        self._power = power
+
+    def unit_variance(self, mu):
+        """Compute the unit variance of a Tweedie distribution v(mu)=mu**power.
+
+        Parameters
+        ----------
+        mu : array, shape (n_samples,)
+            Predicted mean.
+        """
+        return np.power(mu, self.power)
+
+    def unit_variance_derivative(self, mu):
+        """Compute the derivative of the unit variance of a Tweedie
+        distribution v(mu)=power*mu**(power-1).
+
+        Parameters
+        ----------
+        mu : array, shape (n_samples,)
+            Predicted mean.
+ """ + return self.power * np.power(mu, self.power - 1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + # NormalDistribution + return (y - mu)**2 + if p == 1: + # PoissonDistribution + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (xlogy(y, y/mu) - y + mu) + elif p == 2: + # GammaDistribution + return 2 * (np.log(mu/y) + y/mu - 1) + else: + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + + +EDM_DISTRIBUTIONS = { + 'normal': NormalDistribution, + 'poisson': PoissonDistribution, + 'gamma': GammaDistribution, + 'inverse.gaussian': InverseGaussianDistribution, +} diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm/glm.py similarity index 62% rename from sklearn/linear_model/_glm.py rename to sklearn/linear_model/_glm/glm.py index 939249e42e4f4..bff8ea43fd550 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -6,17 +6,26 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import special import scipy.optimize -from ..base import BaseEstimator, RegressorMixin -from ..utils import check_array, check_X_y -from ..utils.optimize import _check_optimize_result -from ..utils.validation import check_is_fitted +from ...base import BaseEstimator, RegressorMixin +from ...utils import check_array, check_X_y +from ...utils.optimize import _check_optimize_result +from ...utils.validation import check_is_fitted +from .distribution import ( + ExponentialDispersionModel, + TweedieDistribution, + EDM_DISTRIBUTIONS +) +from .link import ( + Link, + IdentityLink, + LogLink, + LogitLink, +) def _check_weights(sample_weight, n_samples): @@ -46,561 +55,6 @@ def _check_weights(sample_weight, n_samples): return weights -def _safe_lin_pred(X, coef): - """Compute the linear predictor taking care if intercept is present.""" - if coef.size == X.shape[1] + 1: - return X @ coef[1:] + coef[0] - else: - return X @ coef - - -class Link(metaclass=ABCMeta): - """Abstract base class for Link functions.""" - - @abstractmethod - def link(self, mu): - """Compute the link function g(mu). - - The link function links the mean mu=E[Y] to the so called linear - predictor (X*w), i.e. g(mu) = linear predictor. - - Parameters - ---------- - mu : array, shape (n_samples,) - Usually the (predicted) mean. - """ - pass - - @abstractmethod - def derivative(self, mu): - """Compute the derivative of the link g'(mu). - - Parameters - ---------- - mu : array, shape (n_samples,) - Usually the (predicted) mean. 
- """ - pass - - @abstractmethod - def inverse(self, lin_pred): - """Compute the inverse link function h(lin_pred). - - Gives the inverse relationship between linear predictor and the mean - mu=E[Y], i.e. h(linear predictor) = mu. - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass - - @abstractmethod - def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse link function h'(lin_pred). - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass - - @abstractmethod - def inverse_derivative2(self, lin_pred): - """Compute 2nd derivative of the inverse link function h''(lin_pred). - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass - - -class IdentityLink(Link): - """The identity link function g(x)=x.""" - - def link(self, mu): - return mu - - def derivative(self, mu): - return np.ones_like(mu) - - def inverse(self, lin_pred): - return lin_pred - - def inverse_derivative(self, lin_pred): - return np.ones_like(lin_pred) - - def inverse_derivative2(self, lin_pred): - return np.zeros_like(lin_pred) - - -class LogLink(Link): - """The log link function g(x)=log(x).""" - - def link(self, mu): - return np.log(mu) - - def derivative(self, mu): - return 1./mu - - def inverse(self, lin_pred): - return np.exp(lin_pred) - - def inverse_derivative(self, lin_pred): - return np.exp(lin_pred) - - def inverse_derivative2(self, lin_pred): - return np.exp(lin_pred) - - -class LogitLink(Link): - """The logit link function g(x)=logit(x).""" - - def link(self, mu): - return special.logit(mu) - - def derivative(self, mu): - return 1. / (mu * (1 - mu)) - - def inverse(self, lin_pred): - return special.expit(lin_pred) - - def inverse_derivative(self, lin_pred): - ep = special.expit(lin_pred) - return ep * (1. - ep) - - def inverse_derivative2(self, lin_pred): - ep = special.expit(lin_pred) - return ep * (1. - ep) * (1. - 2 * ep) - - -class ExponentialDispersionModel(metaclass=ABCMeta): - r"""Base class for reproductive Exponential Dispersion Models (EDM). - - The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by - - .. math:: p(y| \theta, \phi) = c(y, \phi) - \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) - = \tilde{c}(y, \phi) - \exp\left(-\frac{d(y, \mu)}{2\phi}\right) - - with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, - variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, - unit variance :math:`v(\mu)` and - unit deviance :math:`d(y,\mu)`. - - Attributes - ---------- - lower_bound - upper_bound - include_lower_bound - include_upper_bound - - Methods - ------- - in_y_range - unit_variance - unit_variance_derivative - variance - variance_derivative - unit_deviance - unit_deviance_derivative - deviance - deviance_derivative - starting_mu - - _mu_deviance_derivative - - References - ---------- - - https://en.wikipedia.org/wiki/Exponential_dispersion_model. 
- """ - @property - def lower_bound(self): - """Get the lower bound of values for Y~EDM.""" - return self._lower_bound - - @property - def upper_bound(self): - """Get the upper bound of values for Y~EDM.""" - return self._upper_bound - - @property - def include_lower_bound(self): - """Get True if lower bound for y is included: y >= lower_bound.""" - return self._include_lower_bound - - @property - def include_upper_bound(self): - """Get True if upper bound for y is included: y <= upper_bound.""" - return self._include_upper_bound - - def in_y_range(self, x): - """Returns ``True`` if x is in the valid range of Y~EDM. - - Parameters - ---------- - x : array, shape (n_samples,) - Target values. - """ - if self.include_lower_bound: - if self.include_upper_bound: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less(x, self.upper_bound)) - else: - if self.include_upper_bound: - return np.logical_and(np.greater(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater(x, self.lower_bound), - np.less(x, self.upper_bound)) - - @abstractmethod - def unit_variance(self, mu): - r"""Compute the unit variance function. - - The unit variance :math:`v(\mu)` determines the variance as - a function of the mean :math:`\mu` by - :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. - It can also be derived from the unit deviance :math:`d(y,\mu)` as - - .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ - \partial\mu^2}}\big|_{y=\mu} - - See also :func:`variance`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - """ - pass - - @abstractmethod - def unit_variance_derivative(self, mu): - r"""Compute the derivative of the unit variance w.r.t. mu. - - Return :math:`v'(\mu)`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Target values. - """ - pass - - def variance(self, mu, phi=1, weights=1): - r"""Compute the variance function. - - The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is - :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, - with unit variance :math:`v(\mu)` and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance(mu) - - def variance_derivative(self, mu, phi=1, weights=1): - r"""Compute the derivative of the variance w.r.t. mu. - - Returns - :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` - and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance_derivative(mu) - - @abstractmethod - def unit_deviance(self, y, mu): - r"""Compute the unit deviance. - - The unit_deviance :math:`d(y,\mu)` can be defined by the - log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. 
- """ - pass - - def unit_deviance_derivative(self, y, mu): - r"""Compute the derivative of the unit deviance w.r.t. mu. - - The derivative of the unit deviance is given by - :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` - with unit variance :math:`v(\mu)`. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. - """ - return -2 * (y - mu) / self.unit_variance(mu) - - def deviance(self, y, mu, weights=1): - r"""Compute the deviance. - - The deviance is a weighted sum of the per sample unit deviances, - :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` - with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{s}) - - loglike(y,y,\frac{phi}{s})\right)`. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return np.sum(weights * self.unit_deviance(y, mu)) - - def deviance_derivative(self, y, mu, weights=1): - """Compute the derivative of the deviance w.r.t. mu. - - It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return weights * self.unit_deviance_derivative(y, mu) - - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. - ind_weight) * np.average(y, weights=weights)) - - def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu and the derivative of the deviance w.r.t coef.""" - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - d1 = link.inverse_derivative(lin_pred) - temp = d1 * self.deviance_derivative(y, mu, weights) - if coef.size == X.shape[1] + 1: - devp = np.concatenate(([temp.sum()], temp @ X)) - else: - devp = temp @ X # sampe as X.T @ temp - return mu, devp - - -class TweedieDistribution(ExponentialDispersionModel): - r"""A class for the Tweedie distribution. - - A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely - defined by it's mean-variance relationship - :math:`\mathrm{Var}[Y] \propto \mu^power`. - - Special cases are: - - ===== ================ - Power Distribution - ===== ================ - 0 Normal - 1 Poisson - (0,1) Compound Poisson - 2 Gamma - 3 Inverse Gaussian - - Parameters - ---------- - power : float (default=0) - The variance power of the `unit_variance` - :math:`v(\mu) = \mu^{power}`. 
- For ``0 0) and (power < 1): - raise ValueError('For 0 1) and (power < 2): - # Compound Poisson - self._lower_bound = 0 - self._include_lower_bound = True - elif power == 2: - # GammaDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif (power > 2) and (power < 3): - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False - elif power == 3: - # InverseGaussianDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif power > 3: - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False - else: # pragma: no cover - # this branch should be unreachable. - raise ValueError - - self._power = power - - def unit_variance(self, mu): - """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - """ - return np.power(mu, self.power) - - def unit_variance_derivative(self, mu): - """Compute the derivative of the unit variance of a Tweedie - distribution v(mu)=power*mu**(power-1). - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - """ - return self.power * np.power(mu, self.power - 1) - - def unit_deviance(self, y, mu): - p = self.power - if p == 0: - # NormalDistribution - return (y - mu)**2 - if p == 1: - # PoissonDistribution - # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (special.xlogy(y, y/mu) - y + mu) - elif p == 2: - # GammaDistribution - return 2 * (np.log(mu/y) + y/mu - 1) - else: - # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) - # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - - -class NormalDistribution(TweedieDistribution): - """Class for the Normal (aka Gaussian) distribution""" - def __init__(self): - super(NormalDistribution, self).__init__(power=0) - - -class PoissonDistribution(TweedieDistribution): - """Class for the scaled Poisson distribution""" - def __init__(self): - super(PoissonDistribution, self).__init__(power=1) - - -class GammaDistribution(TweedieDistribution): - """Class for the Gamma distribution""" - def __init__(self): - super(GammaDistribution, self).__init__(power=2) - - -class InverseGaussianDistribution(TweedieDistribution): - """Class for the scaled InverseGaussianDistribution distribution""" - def __init__(self): - super(InverseGaussianDistribution, self).__init__(power=3) - - -EDM_DISTRIBUTIONS = { - 'normal': NormalDistribution, - 'poisson': PoissonDistribution, - 'gamma': GammaDistribution, - 'inverse.gaussian': InverseGaussianDistribution, -} - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py new file mode 100644 index 0000000000000..f79f6163ada48 --- /dev/null +++ b/sklearn/linear_model/_glm/link.py @@ -0,0 +1,135 @@ +""" +Link functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod + +import numpy as np +from scipy.special import expit, logit + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link functions.""" + + @abstractmethod + def link(self, mu): + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. 
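A small sketch of the round-trip property g(h(x)) = x that these abstract methods are meant to satisfy, using the ``LogLink`` defined below and the module path added by this patch (it mirrors the check in ``test_link.py`` later in this series):

import numpy as np
from sklearn.linear_model._glm.link import LogLink

link = LogLink()
lin_pred = np.linspace(-3.0, 3.0, 7)
mu = link.inverse(lin_pred)                    # h(lin_pred) = exp(lin_pred) = E[Y]
assert np.allclose(link.link(mu), lin_pred)    # g(h(x)) == x
# since g(h(x)) = x, the chain rule gives g'(h(x)) = 1 / h'(x)
assert np.allclose(link.derivative(mu), 1.0 / link.inverse_derivative(lin_pred))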
+ + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + pass + + @abstractmethod + def derivative(self, mu): + """Compute the derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + pass + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Compute 2nd derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + +class IdentityLink(Link): + """The identity link function g(x)=x.""" + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x).""" + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = expit(lin_pred) + return ep * (1. - ep) * (1. 
- 2 * ep) diff --git a/sklearn/linear_model/_glm/tests/__init__.py b/sklearn/linear_model/_glm/tests/__init__.py new file mode 100644 index 0000000000000..588cf7e93eef0 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/__init__.py @@ -0,0 +1 @@ +# License: BSD 3 clause diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py new file mode 100644 index 0000000000000..1a2efdba0e563 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -0,0 +1,61 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + +from numpy.testing import assert_allclose +from numpy.testing import assert_array_equal +import pytest + +from sklearn.linear_model._glm.distribution import ( + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, +) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +def test_tweedie_distribution_power(): + with pytest.raises(ValueError, match="no distribution exists"): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert dist._include_lower_bound is False + dist.power = 1 + assert dist._include_lower_bound is True + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_allclose(family.deviance(x, x), 0, atol=1e-9) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py similarity index 81% rename from sklearn/linear_model/tests/test_glm.py rename to sklearn/linear_model/_glm/tests/test_glm.py index 1712f7b5e1d3d..ffac6dd11c243 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -8,11 +8,12 @@ from sklearn.datasets import make_regression from sklearn.linear_model import GeneralizedLinearRegressor -from sklearn.linear_model._glm import ( - Link, +from sklearn.linear_model._glm.link import ( IdentityLink, LogLink, LogitLink, +) +from sklearn.linear_model._glm.distribution import ( TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, @@ -21,8 +22,6 @@ from sklearn.metrics import mean_absolute_error from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.testing import assert_array_equal - GLM_SOLVERS = ['lbfgs'] @@ 
-35,77 +34,6 @@ def regression_data(): return X, y -@pytest.mark.parametrize('link', Link.__subclasses__()) -def test_link_properties(link): - """Test link inverse and derivative.""" - rng = np.random.RandomState(42) - x = rng.rand(100)*100 - link = link() # instantiate object - if isinstance(link, LogitLink): - # careful for large x, note expit(36) = 1 - # limit max eta to 15 - x = x / 100 * 15 - assert_allclose(link.link(link.inverse(x)), x) - # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) - assert_allclose(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x)) - - assert ( - link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) - - # for LogitLink, in the following x should be between 0 and 1. - # assert_almost_equal(link.inverse_derivative(link.link(x)), - # 1./link.derivative(x), decimal=decimal) - - -@pytest.mark.parametrize( - 'family, expected', - [(NormalDistribution(), [True, True, True]), - (PoissonDistribution(), [False, True, True]), - (TweedieDistribution(power=1.5), [False, True, True]), - (GammaDistribution(), [False, False, True]), - (InverseGaussianDistribution(), [False, False, True]), - (TweedieDistribution(power=4.5), [False, False, True])]) -def test_family_bounds(family, expected): - """Test the valid range of distributions at -1, 0, 1.""" - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, expected) - - -def test_tweedie_distribution_power(): - with pytest.raises(ValueError, match="no distribution exists"): - TweedieDistribution(power=0.5) - - with pytest.raises(TypeError, match="must be a real number"): - TweedieDistribution(power=1j) - - with pytest.raises(TypeError, match="must be a real number"): - dist = TweedieDistribution() - dist.power = 1j - - dist = TweedieDistribution() - assert dist._include_lower_bound is False - dist.power = 1 - assert dist._include_lower_bound is True - - -@pytest.mark.parametrize( - 'family, chk_values', - [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), - (PoissonDistribution(), [0.1, 1.5]), - (GammaDistribution(), [0.1, 1.5]), - (InverseGaussianDistribution(), [0.1, 1.5]), - (TweedieDistribution(power=-2.5), [0.1, 1.5]), - (TweedieDistribution(power=-1), [0.1, 1.5]), - (TweedieDistribution(power=1.5), [0.1, 1.5]), - (TweedieDistribution(power=2.5), [0.1, 1.5]), - (TweedieDistribution(power=-4), [0.1, 1.5])]) -def test_deviance_zero(family, chk_values): - """Test deviance(y,y) = 0 for different families.""" - for x in chk_values: - assert_allclose(family.deviance(x, x), 0, atol=1e-9) - - def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" # scalar value but not positive diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py new file mode 100644 index 0000000000000..3a2a21c4c04e0 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -0,0 +1,38 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause +import numpy as np +from numpy.testing import assert_allclose +import pytest + +from sklearn.linear_model._glm.link import ( + IdentityLink, + LogLink, + LogitLink, +) + + +LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink] + + +@pytest.mark.parametrize('link', LINK_FUNCTIONS) +def test_link_properties(link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100)*100 + link = link() # instantiate object + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + 
assert_allclose(link.link(link.inverse(x)), x) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_allclose(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x)) + + assert ( + link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) + + # for LogitLink, in the following x should be between 0 and 1. + # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) From 5927379debd8e591d72c5915ad434fc0592f5665 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 18:27:36 +0200 Subject: [PATCH 088/209] Fix CI --- sklearn/linear_model/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 1c0df55d27c90..8fc662a48a6ae 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -81,5 +81,4 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'TweedieDistribution', 'PoissonRegressor'] From a6df2a788b5a4907b6c3a9b51a758f5b05e95257 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 18:57:53 +0200 Subject: [PATCH 089/209] Add test_deviance_derivative --- sklearn/linear_model/__init__.py | 2 +- sklearn/linear_model/_glm/__init__.py | 4 +- .../_glm/tests/test_distribution.py | 44 +++++++++++++++++-- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 8fc662a48a6ae..e8f63e95cdfa8 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,7 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm.glm import (GeneralizedLinearRegressor, PoissonRegressor) +from ._glm import (GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index 9a88e5604de8a..ac0f2c206ffe0 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,5 +1,5 @@ # License: BSD 3 clause -from . 
import distribution +from .glm import (GeneralizedLinearRegressor, PoissonRegressor) -__all__ = ['distribution'] +__all__ = ['GeneralizedLinearRegressor', 'PoissonRegressor'] diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index 1a2efdba0e563..f457fc882a809 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -1,9 +1,12 @@ # Authors: Christian Lorentzen # # License: BSD 3 clause - -from numpy.testing import assert_allclose -from numpy.testing import assert_array_equal +import numpy as np +from numpy.testing import ( + assert_allclose, + assert_array_equal, +) +from scipy.optimize import check_grad import pytest from sklearn.linear_model._glm.distribution import ( @@ -59,3 +62,38 @@ def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: assert_allclose(family.deviance(x, x), 0, atol=1e-9) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), + PoissonDistribution(), + GammaDistribution(), + InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=-4)], + ids=lambda x: x.__class__.__name__ +) +def test_deviance_derivative(family): + """Test deviance derivative for different families.""" + rng = np.random.RandomState(0) + y_true = rng.rand(10) + # make data positive + y_true += np.abs(y_true.min()) + 1e-2 + + y_pred = y_true + np.fmax(rng.rand(10), 0.) + + dev = family.deviance(y_true, y_pred) + assert isinstance(dev, float) + dev_derivative = family.deviance_derivative(y_true, y_pred) + assert dev_derivative.shape == y_pred.shape + + err = check_grad( + lambda mu: family.deviance(y_true, mu), + lambda mu: family.deviance_derivative(y_true, mu), + y_pred, + ) / np.linalg.norm(dev_derivative) + assert err < 1e-6 From 5af89a70fd4a168f3ab1d0d1966d6357e6bc4e68 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 19:58:54 +0200 Subject: [PATCH 090/209] Fix sklearn/linear_model/setup.py --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 2 +- sklearn/linear_model/_glm/tests/test_distribution.py | 2 +- sklearn/linear_model/setup.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 1c8dd42df336d..fba1cc42e20a7 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -40,7 +40,7 @@ from sklearn.compose import ColumnTransformer from sklearn.linear_model import GeneralizedLinearRegressor -from sklearn.linear_model._glm import TweedieDistribution +from sklearn.linear_model._glm.distribution import TweedieDistribution from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index f457fc882a809..f984077bed5cb 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -96,4 +96,4 @@ def test_deviance_derivative(family): lambda mu: family.deviance_derivative(y_true, mu), y_pred, ) 
/ np.linalg.norm(dev_derivative) - assert err < 1e-6 + assert abs(err) < 1e-6 diff --git a/sklearn/linear_model/setup.py b/sklearn/linear_model/setup.py index 8226412fdecbd..5cf7040d4c9d4 100644 --- a/sklearn/linear_model/setup.py +++ b/sklearn/linear_model/setup.py @@ -42,6 +42,7 @@ def configuration(parent_package='', top_path=None): # add other directories config.add_subpackage('tests') + config.add_subpackage('_glm') return config From cd347d4610d00dca9e7a01de2067595a4e4b8a59 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 17 Jul 2019 14:01:44 +0200 Subject: [PATCH 091/209] Remove variance and variance_derivative methods from distributions --- sklearn/linear_model/_glm/distribution.py | 72 ++--------------------- 1 file changed, 4 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 2dc720124b48b..18e7cf0c0a227 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -44,16 +44,13 @@ class ExponentialDispersionModel(metaclass=ABCMeta): Methods ------- + deviance + deviance_derivative in_y_range - unit_variance - unit_variance_derivative - variance - variance_derivative unit_deviance unit_deviance_derivative - deviance - deviance_derivative - starting_mu + unit_variance + unit_variance_derivative _mu_deviance_derivative @@ -139,47 +136,6 @@ def unit_variance_derivative(self, mu): """ pass - def variance(self, mu, phi=1, weights=1): - r"""Compute the variance function. - - The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is - :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, - with unit variance :math:`v(\mu)` and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance(mu) - - def variance_derivative(self, mu, phi=1, weights=1): - r"""Compute the derivative of the variance w.r.t. mu. - - Returns - :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` - and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance_derivative(mu) - @abstractmethod def unit_deviance(self, y, mu): r"""Compute the unit deviance. @@ -257,26 +213,6 @@ def deviance_derivative(self, y, mu, weights=1): """ return weights * self.unit_deviance_derivative(y, mu) - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. 
- ind_weight) * np.average(y, weights=weights)) - def _mu_deviance_derivative(self, coef, X, y, weights, link): """Compute mu and the derivative of the deviance w.r.t coef.""" lin_pred = _safe_lin_pred(X, coef) From 0d7f9cd9babe928a01266f43f1f5e4e41829075b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 17 Jul 2019 15:05:49 +0200 Subject: [PATCH 092/209] Improve coverage --- sklearn/linear_model/_glm/distribution.py | 6 ++-- sklearn/linear_model/_glm/link.py | 10 +++---- sklearn/linear_model/_glm/tests/test_glm.py | 31 +++++++++++++++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 18e7cf0c0a227..6fffd816ff8f9 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -121,7 +121,7 @@ def unit_variance(self, mu): mu : array, shape (n_samples,) Predicted mean. """ - pass + pass # pragma: no cover @abstractmethod def unit_variance_derivative(self, mu): @@ -134,7 +134,7 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Target values. """ - pass + pass # pragma: no cover @abstractmethod def unit_deviance(self, y, mu): @@ -153,7 +153,7 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - pass + pass # pragma: no cover def unit_deviance_derivative(self, y, mu): r"""Compute the derivative of the unit deviance w.r.t. mu. diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index f79f6163ada48..ec9a7b7736eb1 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -26,7 +26,7 @@ def link(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - pass + pass # pragma: no cover @abstractmethod def derivative(self, mu): @@ -37,7 +37,7 @@ def derivative(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - pass + pass # pragma: no cover @abstractmethod def inverse(self, lin_pred): @@ -51,7 +51,7 @@ def inverse(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - pass + pass # pragma: no cover @abstractmethod def inverse_derivative(self, lin_pred): @@ -62,7 +62,7 @@ def inverse_derivative(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - pass + pass # pragma: no cover @abstractmethod def inverse_derivative2(self, lin_pred): @@ -73,7 +73,7 @@ def inverse_derivative2(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. 
""" - pass + pass # pragma: no cover class IdentityLink(Link): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index ffac6dd11c243..a0ac77b65aa9f 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -219,6 +219,37 @@ def test_glm_log_regression(family, solver, tol): assert_allclose(res.coef_, coef, rtol=5e-6) +@pytest.mark.parametrize('fit_intercept', [True, False]) +def test_warm_start(fit_intercept): + n_samples, n_features = 100, 10 + n_predict = 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=42) + + glm1 = GeneralizedLinearRegressor( + warm_start=False, + fit_intercept=fit_intercept, + max_iter=1000 + ) + glm1.fit(X, y) + + glm2 = GeneralizedLinearRegressor( + warm_start=True, + fit_intercept=fit_intercept, + max_iter=1 + ) + glm2.fit(X, y) + assert glm1.score(X, y) > glm2.score(X, y) + glm2.set_params(max_iter=1000) + glm2.fit(X, y) + assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-4, atol=1e-5) + assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) + # TODO: investigate why this doesn't match + # assert glm1.n_iter_ == glm2.n_iter_ + 2 + + @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('solver', GLM_SOLVERS) From dbffad84fcab9521265e85d5e7cbf9da0673b380 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 17 Jul 2019 15:44:44 +0200 Subject: [PATCH 093/209] Remove mentions of the binomial distribution --- sklearn/linear_model/_glm/glm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index bff8ea43fd550..0115f9d160337 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -85,7 +85,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} \ or an instance of class ExponentialDispersionModel, \ optional(default='normal') The distributional assumption of the GLM, i.e. which distribution from @@ -101,8 +101,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' - - 'logit' for family 'binomial' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use the chi squared statistic or the deviance statistic. 
If None, the From 3187204dc4c1bb41a6131c49262c8380a34b9ec1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 19 Jul 2019 17:37:11 +0200 Subject: [PATCH 094/209] Use common simple weight validation --- sklearn/linear_model/_glm/glm.py | 37 +++------------------ sklearn/linear_model/_glm/tests/test_glm.py | 19 ++--------- 2 files changed, 7 insertions(+), 49 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0115f9d160337..546e71c583fa8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -14,7 +14,7 @@ from ...base import BaseEstimator, RegressorMixin from ...utils import check_array, check_X_y from ...utils.optimize import _check_optimize_result -from ...utils.validation import check_is_fitted +from ...utils.validation import check_is_fitted, _check_sample_weight from .distribution import ( ExponentialDispersionModel, TweedieDistribution, @@ -28,33 +28,6 @@ ) -def _check_weights(sample_weight, n_samples): - """Check that sample weights are non-negative and have the right shape.""" - if sample_weight is None: - weights = np.ones(n_samples) - elif np.isscalar(sample_weight): - if sample_weight <= 0: - raise ValueError("Sample weights must be non-negative.") - weights = sample_weight * np.ones(n_samples) - else: - _dtype = [np.float64, np.float32] - weights = check_array(sample_weight, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != n_samples: - raise ValueError("Sample weights must have the same length as " - "y") - if not np.all(weights >= 0): - raise ValueError("Sample weights must be non-negative.") - elif not np.sum(weights) > 0: - raise ValueError("Sample weights must have at least one positive " - "element.") - - return weights - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. @@ -323,7 +296,7 @@ def fit(self, X, y, sample_weight=None): copy=self.copy_X) y = np.asarray(y, dtype=np.float64) - weights = _check_weights(sample_weight, y.shape[0]) + weights = _check_sample_weight(sample_weight, X) n_samples, n_features = X.shape @@ -458,7 +431,7 @@ def predict(self, X, sample_weight=None): allow_nd=False) eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) - weights = _check_weights(sample_weight, X.shape[0]) + weights = _check_sample_weight(sample_weight, X) return mu*weights @@ -487,7 +460,7 @@ def estimate_phi(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape - weights = _check_weights(sample_weight, n_samples) + weights = _check_sample_weight(sample_weight, X) eta = X @ self.coef_ if self.fit_intercept is True: eta += self.intercept_ @@ -542,7 +515,7 @@ def score(self, X, y, sample_weight=None): # Note, default score defined in RegressorMixin is R^2 score. 
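        # D^2 is the fraction of deviance explained,
        #     D^2 = 1 - deviance(y, y_pred) / deviance(y, y_mean),
        # i.e. the deviance analogue of R^2 (the two coincide for the
        # normal distribution with identity link).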
# TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) - weights = _check_weights(sample_weight, y.shape[0]) + weights = _check_sample_weight(sample_weight, X) mu = self.predict(X) dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a0ac77b65aa9f..2909c85ba8c40 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -41,8 +41,6 @@ def test_sample_weights_validation(): y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - with pytest.raises(ValueError, match="weights must be non-negative"): - glm.fit(X, y, weights) # Positive weights are accepted glm.fit(X, y, sample_weight=1) @@ -54,21 +52,8 @@ def test_sample_weights_validation(): # 1d but wrong length weights = [1, 0] - with pytest.raises(ValueError, - match="weights must have the same length as y"): - glm.fit(X, y, weights) - - # 1d but only zeros (sum not greater than 0) - weights = [0, 0] - X = [[0], [1]] - y = [1, 2] - with pytest.raises(ValueError, - match="must have at least one positive element"): - glm.fit(X, y, weights) - - # 5. 1d but with a negative value - weights = [2, -1] - with pytest.raises(ValueError, match="weights must be non-negative"): + msg = r"sample_weight.shape == \(2,\), expected \(1,\)!" + with pytest.raises(ValueError, match=msg): glm.fit(X, y, weights) From cc03c1ad58db4108ea1bfc3d2d47225be8822bd8 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 19 Jul 2019 17:41:01 +0200 Subject: [PATCH 095/209] Simplify comments formatting --- sklearn/linear_model/_glm/glm.py | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 546e71c583fa8..ea3e4964529bf 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -208,12 +208,6 @@ def fit(self, X, y, sample_weight=None): ------- self : returns an instance of self. """ - ####################################################################### - # 1. input validation # - ####################################################################### - # 1.1 validate arguments of __init__ - # Guarantee that self._family_instance is an instance of class - # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family elif self.family in EDM_DISTRIBUTIONS: @@ -288,7 +282,6 @@ def fit(self, X, y, sample_weight=None): family = self._family_instance link = self._link_instance - # 1.2 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] _stype = ['csc', 'csr'] X, y = check_X_y(X, y, accept_sparse=_stype, @@ -300,7 +293,6 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape - # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " @@ -308,9 +300,8 @@ def fit(self, X, y, sample_weight=None): .format(family.__class__.__name__)) # TODO: if alpha=0 check that X is not rank deficient - ####################################################################### - # 2. 
rescaling of weights (sample_weight) # - ####################################################################### + # rescaling of sample_weight + # # IMPORTANT NOTE: Since we want to minimize # 1/(2*sum(sample_weight)) * deviance + L2, # deviance = sum(sample_weight * unit_deviance), @@ -319,9 +310,7 @@ def fit(self, X, y, sample_weight=None): weights_sum = np.sum(weights) weights = weights/weights_sum - ####################################################################### - # 3. initialization of coef = (intercept_, coef_) # - ####################################################################### + # initialization of coef = (intercept_, coef) # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. @@ -338,12 +327,8 @@ def fit(self, X, y, sample_weight=None): else: coef = np.zeros(n_features) - ####################################################################### - # 4. fit # - ####################################################################### # algorithms for optimization - # 4.1 L-BFGS ########################################################## if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): mu, devp = \ @@ -371,9 +356,6 @@ def func(coef, X, y, weights, alpha, family, link): self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x - ####################################################################### - # 5. postprocessing # - ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] @@ -425,9 +407,8 @@ def predict(self, X, sample_weight=None): C : array, shape (n_samples,) Returns predicted values times sample_weight. """ - # TODO: Is copy=True necessary? X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', copy=True, ensure_2d=True, + dtype='numeric', ensure_2d=True, allow_nd=False) eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) From aa52b4a32e216bfdd516776ac6796f8c0b4e5137 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 22 Jul 2019 15:50:47 +0200 Subject: [PATCH 096/209] Refactor to use TweedieDistribition in metrics --- sklearn/linear_model/_glm/distribution.py | 33 ++++++++++++++--------- sklearn/metrics/regression.py | 14 +++------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 6fffd816ff8f9..b99e1b40b2871 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -330,21 +330,28 @@ def unit_variance_derivative(self, mu): def unit_deviance(self, y, mu): p = self.power - if p == 0: - # NormalDistribution - return (y - mu)**2 - if p == 1: - # PoissonDistribution - # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (xlogy(y, y/mu) - y + mu) + if p < 0: + # 'Extreme stable', y_true any realy number, y_pred > 0 + dev = 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p) * (2-p)) - + y * np.power(mu, 1-p)/(1-p) + + np.power(mu, 2-p)/(2-p)) + elif p == 0: + # Normal distribution, y_true and y_pred any real number + dev = (y - mu)**2 + elif p < 1: + raise ValueError("Tweedie deviance is only defined for p<=0 and " + "p>=1.") + elif p == 1: + # Poisson distribution + dev = 2 * (xlogy(y, y/mu) - y + mu) elif p == 2: - # GammaDistribution - return 2 * (np.log(mu/y) + y/mu - 1) + # Gamma distribution + dev = 2 * (np.log(mu/y) + y/mu - 1) else: - # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) - # - 
y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + dev = 2 * (np.power(y, 2-p)/((1-p) * (2-p)) - + y * np.power(mu, 1-p)/(1-p) + + np.power(mu, 2-p)/(2-p)) + return dev class NormalDistribution(TweedieDistribution): diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 2cba3d31ec84a..2f6c442c1f824 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -665,6 +665,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): >>> mean_tweedie_deviance(y_true, y_pred, p=1) 1.4260... """ + from ..linear_model._glm.distribution import TweedieDistribution y_type, y_true, y_pred, _ = _check_reg_targets( y_true, y_pred, None, dtype=[np.float64, np.float32]) if y_type == 'continuous-multioutput': @@ -681,12 +682,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): # 'Extreme stable', y_true any realy number, y_pred > 0 if (y_pred <= 0).any(): raise ValueError(message + "strictly positive y_pred.") - dev = 2 * (np.power(np.maximum(y_true, 0), 2-p)/((1-p) * (2-p)) - - y_true * np.power(y_pred, 1-p)/(1-p) + - np.power(y_pred, 2-p)/(2-p)) elif p == 0: - # Normal distribution, y_true and y_pred any real number - dev = (y_true - y_pred)**2 + pass elif p < 1: raise ValueError("Tweedie deviance is only defined for p<=0 and " "p>=1.") @@ -695,12 +692,10 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): if (y_true < 0).any() or (y_pred <= 0).any(): raise ValueError(message + "non-negative y_true and strictly " "positive y_pred.") - dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) elif p == 2: # Gamma distribution, y_true and y_pred > 0 if (y_true <= 0).any() or (y_pred <= 0).any(): raise ValueError(message + "strictly positive y_true and y_pred.") - dev = 2 * (np.log(y_pred/y_true) + y_true/y_pred - 1) else: if p < 2: # 1 < p < 2 is Compound Poisson, y_true >= 0, y_pred > 0 @@ -712,9 +707,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): raise ValueError(message + "strictly positive y_true and " "y_pred.") - dev = 2 * (np.power(y_true, 2-p)/((1-p) * (2-p)) - - y_true * np.power(y_pred, 1-p)/(1-p) + - np.power(y_pred, 2-p)/(2-p)) + dist = TweedieDistribution(power=p) + dev = dist.unit_deviance(y_true, y_pred) return np.average(dev, weights=sample_weight) From 816aa8f36f3f96a6616a7aac0b4d2583b71f0fc7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jul 2019 07:54:28 +0200 Subject: [PATCH 097/209] WIP --- .../plot_poisson_regression_non_normal_loss.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 0537704b2cf1f..6b12370a9433c 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -25,7 +25,6 @@ import numpy as np import matplotlib.pyplot as plt import pandas as pd -from scipy.special import xlogy from sklearn.compose import ColumnTransformer from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression @@ -36,6 +35,7 @@ from sklearn.ensemble import GradientBoostingRegressor from sklearn.metrics import mean_squared_error, mean_absolute_error +from sklearn.metrics import mean_poisson_deviance def load_mtpl2(n_samples=100000): @@ -140,13 +140,6 @@ def load_mtpl2(n_samples=100000): 
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) -def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): - y_true = np.atleast_1d(y_true) - y_pred = np.atleast_1d(y_pred) - dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) - return np.average(dev, weights=sample_weights) - - eps = 1e-5 print("MSE: %.3f" % mean_squared_error( df_test.Frequency.values, np.zeros(len(df_test)), @@ -154,7 +147,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, np.zeros(len(df_test)), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, eps + np.zeros(len(df_test)), df_test.Exposure.values)) @@ -175,7 +168,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): print("MSE: %.3f" % mean_absolute_error( df_test.Frequency.values, linregr.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps), df_test.Exposure.values)) @@ -201,7 +194,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, glm_freq.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, glm_freq.predict(X_test), df_test.Exposure.values)) @@ -221,7 +214,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) ############################################################################## From 6500c81a354c13de43ce0956240893b21c3979fb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jul 2019 16:47:13 +0200 Subject: [PATCH 098/209] Use Poisson deviance in examples --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 6b12370a9433c..ba9c0c8898fcd 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -184,7 +184,7 @@ def load_mtpl2(n_samples=100000): # # Next we fit the Poisson regressor on the target variable, -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0, max_iter=1000) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) print("PoissonRegressor") @@ -215,7 +215,8 @@ def load_mtpl2(n_samples=100000): print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, gbr.predict(X_test), 
df_test.Exposure.values)) + df_test.Frequency.values, np.fmax(gbr.predict(X_test), eps), + df_test.Exposure.values)) ############################################################################## # From 59a6d9dd2aeae4901f138f6d1c892c48c427f9a8 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jul 2019 18:22:26 +0200 Subject: [PATCH 099/209] Use PoissonRegressor and GammaRegressor in examples --- doc/modules/classes.rst | 2 + ...plot_poisson_regression_non_normal_loss.py | 4 +- ...lot_tweedie_regression_insurance_claims.py | 15 +- sklearn/linear_model/__init__.py | 2 +- sklearn/linear_model/_glm/__init__.py | 4 +- sklearn/linear_model/_glm/glm.py | 146 +++++++++++++++--- sklearn/linear_model/_glm/tests/test_glm.py | 3 +- 7 files changed, 145 insertions(+), 31 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 6346c0c65fadc..177cd0780f9be 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -750,6 +750,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GammaRegressor linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars @@ -771,6 +772,7 @@ Kernels: linear_model.PassiveAggressiveClassifier linear_model.PassiveAggressiveRegressor linear_model.Perceptron + linear_model.PoissonRegressor linear_model.RANSACRegressor linear_model.Ridge linear_model.RidgeClassifier diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index ba9c0c8898fcd..713866a712aea 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -27,7 +27,7 @@ import pandas as pd from sklearn.compose import ColumnTransformer -from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression +from sklearn.linear_model import PoissonRegressor, LinearRegression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -184,7 +184,7 @@ def load_mtpl2(n_samples=100000): # # Next we fit the Poisson regressor on the target variable, -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0, max_iter=1000) +glm_freq = PoissonRegressor(alpha=0, max_iter=1000) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) print("PoissonRegressor") diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index fba1cc42e20a7..55d0ca24ce477 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -39,7 +39,8 @@ import pandas as pd from sklearn.compose import ColumnTransformer -from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model import PoissonRegressor, GammaRegressor +from sklearn.linear_model._glm import GeneralizedLinearRegressor from sklearn.linear_model._glm.distribution import TweedieDistribution from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline @@ -192,7 +193,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # Some of the features are colinear, we use a weak penalization to avoid # numerical issues. 
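[Editorial sketch, not part of this diff: the `np.fmax(..., eps)` clipping applied to the least-squares and gradient-boosting predictions above is needed because the Poisson deviance is undefined for non-positive predictions. A minimal example, assuming the `mean_poisson_deviance` metric introduced earlier in this series is importable from `sklearn.metrics`:]

    import numpy as np
    from sklearn.metrics import mean_poisson_deviance

    y_true = np.array([0.0, 1.0, 2.0])
    y_pred = np.array([-0.2, 0.5, 1.5])  # a squared-loss model may predict <= 0
    eps = 1e-5
    # d(y, mu) = 2*(y*log(y/mu) - y + mu) requires mu > 0, so clip first:
    print(mean_poisson_deviance(y_true, np.fmax(y_pred, eps)))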
-glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=1e-2) +glm_freq = PoissonRegressor(alpha=1e-2) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) @@ -330,7 +331,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GeneralizedLinearRegressor(family="gamma") +glm_sev = GammaRegressor() glm_sev.fit( X_train[mask_train.values], @@ -464,12 +465,16 @@ def score(self, X, y, sample_weight=None): # this takes a while params = { "family": [ - TweedieDistribution(power=power) for power in np.linspace(1, 2, 8) + TweedieDistribution(power=power) + # exclude upper bound as power=2 does not support null y samples. + for power in np.linspace(1 + 1e-4, 2 - 1e-4, 8) ] } + glm_total = GridSearchCV( - GeneralizedLinearRegressor(), cv=3, param_grid=params, n_jobs=-1 + GeneralizedLinearRegressor(tol=1e-3, max_iter=500), cv=3, + param_grid=params, n_jobs=-1 ) glm_total.fit( X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index e8f63e95cdfa8..d4227a126e5ec 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,7 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (GeneralizedLinearRegressor, PoissonRegressor) +from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index ac0f2c206ffe0..a7d4b67817730 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,5 +1,5 @@ # License: BSD 3 clause -from .glm import (GeneralizedLinearRegressor, PoissonRegressor) +from .glm import GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor -__all__ = ['GeneralizedLinearRegressor', 'PoissonRegressor'] +__all__ = ["GeneralizedLinearRegressor", "PoissonRegressor", "GammaRegressor"] diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index ea3e4964529bf..12486e188951a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -24,7 +24,6 @@ Link, IdentityLink, LogLink, - LogitLink, ) @@ -64,7 +63,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + link : {'auto', 'identity', 'log'} or an instance of class Link, \ optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). Option 'auto' sets the link depending on @@ -160,11 +159,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): References ---------- - For the coordinate descent implementation: - * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). 
The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', @@ -240,12 +241,10 @@ def fit(self, X, y, sample_weight=None): self._link_instance = IdentityLink() elif self.link == 'log': self._link_instance = LogLink() - elif self.link == 'logit': - self._link_instance = LogitLink() else: raise ValueError( "The link must be an instance of class Link or " - "an element of ['auto', 'identity', 'log', 'logit']; " + "an element of ['auto', 'identity', 'log']; " "got (link={0})".format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: @@ -606,18 +605,127 @@ class PoissonRegressor(GeneralizedLinearRegressor): References ---------- - For the coordinate descent implementation: - * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ """ - def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, alpha=1.0, fit_intercept=True, link='log', + fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="poisson", link=link, + fit_dispersion=fit_dispersion, solver=solver, + max_iter=max_iter, tol=tol, warm_start=warm_start, + copy_X=copy_X, verbose=verbose) + + +class GammaRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Gamma distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). + The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms and thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statistic or the deviance statistic. If None, the + dispersion is not estimated. 
+ + solver : {'lbfgs'}, optional (default='lbfgs') + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. + + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + + References + ---------- + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. 
+ `_ + """ + def __init__(self, alpha=1.0, fit_intercept=True, link='log', + fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="poisson", link='log', + family="gamma", link=link, fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2909c85ba8c40..089f251d77049 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -77,8 +77,7 @@ def test_glm_family_argument(f, fam): @pytest.mark.parametrize('l, link', [('identity', IdentityLink()), - ('log', LogLink()), - ('logit', LogitLink())]) + ('log', LogLink())]) def test_glm_link_argument(l, link): """Test GLM link argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions From 03a8a2d06acaf1b2d57bdef51991fecb7482cf75 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 11:12:48 +0200 Subject: [PATCH 100/209] Improve documentation wording --- doc/modules/linear_model.rst | 41 ++++++++++--------- ...plot_poisson_regression_non_normal_loss.py | 5 +-- sklearn/linear_model/__init__.py | 6 ++- sklearn/linear_model/_glm/tests/test_glm.py | 1 - 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index b6f7c2b82c1f5..0eaaab2ecded5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,7 +893,7 @@ to warm-starting (see :term:`Glossary `). Generalized Linear Regression ============================= -:class:`GeneralizedLinearRegressor` generalizes linear models in two ways +Generalized Linear Models (GLM) extend linear models in two ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as @@ -901,13 +901,31 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function being -minimized becomes +exponential dispersion model (EDM) [11]_. The minimized objective function is +the penalized negative log likelihood, .. math:: \frac{1}{2 \sum s_i}D(y, \hat{y}; s) +\frac{\alpha}{2} ||w||_2 with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as + +.. 
math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ + + In the following use cases, a loss different from the squared loss might be appropriate, @@ -968,23 +986,6 @@ The objective function (the penalized negative log likelihood) is independent of :math:`\phi` and is minimized with respect to the coefficients :math:`w`. -The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` -likelihood as - -.. math:: d(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - - loglike(y,y,\phi)\right) \\ - D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) - -===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` -===================================== =============================== ================================= ============================================ -Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` -Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` -Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` -===================================== =============================== ================================= ============================================ - Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 713866a712aea..695a7c7594472 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -4,9 +4,8 @@ ====================================== This example illustrates the use of linear Poisson regression -on the French Motor Third-Party Liability Claims dataset [1] and compare -it with learning models with least squared error. - +on the French Motor Third-Party Liability Claims dataset [1] and compares +it with models learned with least squared error. We start by defining a few helper functions for loading the data and visualizing results. 
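[Editorial sketch, not part of this diff: the closed-form unit deviances tabulated in the linear_model.rst hunk above can be checked numerically against the general Tweedie deviance. This assumes the `mean_tweedie_deviance` metric from this branch, with the `p` keyword used throughout this series:]

    import numpy as np
    from scipy.special import xlogy
    from sklearn.metrics import mean_tweedie_deviance

    y, mu = 2.0, 1.5
    poisson_dev = 2 * (xlogy(y, y / mu) - y + mu)   # table row for the Poisson case (p=1)
    gamma_dev = 2 * (np.log(mu / y) + y / mu - 1)   # table row for the Gamma case (p=2)
    # Both should match the general Tweedie deviance at the corresponding power:
    print(np.isclose(mean_tweedie_deviance([y], [mu], p=1), poisson_dev))  # True
    print(np.isclose(mean_tweedie_deviance([y], [mu], p=2), gamma_dev))    # True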
diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index d4227a126e5ec..63a52a9c9898e 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor) +from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, + GammaRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -81,4 +82,5 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'PoissonRegressor'] + 'PoissonRegressor', + 'GammaRegressor'] diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 089f251d77049..542c18b65cad2 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -11,7 +11,6 @@ from sklearn.linear_model._glm.link import ( IdentityLink, LogLink, - LogitLink, ) from sklearn.linear_model._glm.distribution import ( TweedieDistribution, From bbf7f38fa69a9335397fa1cdfd73d153cbc3670d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 11:29:54 +0200 Subject: [PATCH 101/209] Use dataframe OpenML fetcher --- ...plot_poisson_regression_non_normal_loss.py | 19 ++++++++----------- ...lot_tweedie_regression_insurance_claims.py | 19 ++++++++----------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 695a7c7594472..347a424d8f4fe 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -25,6 +25,7 @@ import matplotlib.pyplot as plt import pandas as pd +from sklearn.datasets import fetch_openml from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, LinearRegression from sklearn.model_selection import train_test_split @@ -46,20 +47,16 @@ def load_mtpl2(n_samples=100000): number of samples to select (for faster run time). 
""" - # Note: this should use the OpenML DataFrame fetcher in the future - df_freq = pd.read_csv( - "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", - dtype={"IDpol": np.int}, - index_col=0, - ) + # freMTPL2freq dataset from https://www.openml.org/d/41214 + df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] + df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) + df_freq.set_index('IDpol', inplace=True) - df_sev = pd.read_csv( - "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", - index_col=0, - ) + # freMTPL2sev dataset from https://www.openml.org/d/41215 + df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] # sum ClaimAmount over identical IDs - df_sev = df_sev.groupby(level=0).sum() + df_sev = df_sev.groupby('IDpol').sum() df = df_freq.join(df_sev, how="left") df["ClaimAmount"].fillna(0, inplace=True) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 55d0ca24ce477..418127699dc1a 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -38,6 +38,7 @@ import matplotlib.pyplot as plt import pandas as pd +from sklearn.datasets import fetch_openml from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, GammaRegressor from sklearn.linear_model._glm import GeneralizedLinearRegressor @@ -59,20 +60,16 @@ def load_mtpl2(n_samples=100000): number of samples to select (for faster run time). """ - # Note: this should use the OpenML DataFrame fetcher in the future - df_freq = pd.read_csv( - "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", - dtype={"IDpol": np.int}, - index_col=0, - ) + # freMTPL2freq dataset from https://www.openml.org/d/41214 + df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] + df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) + df_freq.set_index('IDpol', inplace=True) - df_sev = pd.read_csv( - "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", - index_col=0, - ) + # freMTPL2sev dataset from https://www.openml.org/d/41215 + df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] # sum ClaimAmount over identical IDs - df_sev = df_sev.groupby(level=0).sum() + df_sev = df_sev.groupby('IDpol').sum() df = df_freq.join(df_sev, how="left") df["ClaimAmount"].fillna(0, inplace=True) From 49a3a8e116322c59f56b9d0b3fdb77028b144dd3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 12:00:26 +0200 Subject: [PATCH 102/209] Refactor distibution bounds --- sklearn/linear_model/_glm/distribution.py | 111 +++++------------- .../_glm/tests/test_distribution.py | 21 +++- 2 files changed, 50 insertions(+), 82 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index b99e1b40b2871..e6635d2003850 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -6,6 +6,7 @@ # License: BSD 3 clause from abc import ABCMeta, abstractmethod +from collections import namedtuple import numbers import numpy as np @@ -20,6 +21,10 @@ def _safe_lin_pred(X, coef): return X @ coef +DistributionBoundary = namedtuple("DistributionBoundary", + ("value", "inclusive")) + + class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). 
@@ -35,13 +40,6 @@ class ExponentialDispersionModel(metaclass=ABCMeta): unit variance :math:`v(\mu)` and unit deviance :math:`d(y,\mu)`. - Attributes - ---------- - lower_bound - upper_bound - include_lower_bound - include_upper_bound - Methods ------- deviance @@ -52,55 +50,33 @@ class ExponentialDispersionModel(metaclass=ABCMeta): unit_variance unit_variance_derivative - _mu_deviance_derivative - References ---------- - https://en.wikipedia.org/wiki/Exponential_dispersion_model. """ - @property - def lower_bound(self): - """Get the lower bound of values for Y~EDM.""" - return self._lower_bound - @property - def upper_bound(self): - """Get the upper bound of values for Y~EDM.""" - return self._upper_bound - - @property - def include_lower_bound(self): - """Get True if lower bound for y is included: y >= lower_bound.""" - return self._include_lower_bound - - @property - def include_upper_bound(self): - """Get True if upper bound for y is included: y <= upper_bound.""" - return self._include_upper_bound - - def in_y_range(self, x): - """Returns ``True`` if x is in the valid range of Y~EDM. + def in_y_range(self, y): + """Returns ``True`` if y is in the valid range of Y~EDM. Parameters ---------- - x : array, shape (n_samples,) + y : array, shape (n_samples,) Target values. """ - if self.include_lower_bound: - if self.include_upper_bound: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less(x, self.upper_bound)) + if hasattr(self, '_upper_bound'): + # All currently supported distributions have an upper bound at + # +inf, however this may need to be implemented for other + # distributions + raise NotImplementedError + + if not isinstance(self._lower_bound, DistributionBoundary): + raise TypeError('_lower_bound attribute must be of type ' + 'DistributionBoundary') + + if self._lower_bound.inclusive: + return np.greater_equal(y, self._lower_bound.value) else: - if self.include_upper_bound: - return np.logical_and(np.greater(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater(x, self.lower_bound), - np.less(x, self.upper_bound)) + return np.greater(y, self._lower_bound.value) @abstractmethod def unit_variance(self, mu): @@ -265,42 +241,17 @@ def power(self, power): raise TypeError('power must be a real number, input was {0}' .format(power)) - self._upper_bound = np.Inf - self._include_upper_bound = False - if power < 0: - # Extreme Stable - self._lower_bound = -np.Inf - self._include_lower_bound = False - elif power == 0: - # NormalDistribution - self._lower_bound = -np.Inf - self._include_lower_bound = False - elif (power > 0) and (power < 1): + if power <= 0: + # Extreme Stable or Normal distribution + self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False) + elif 0 < power < 1: raise ValueError('For 0 1) and (power < 2): - # Compound Poisson - self._lower_bound = 0 - self._include_lower_bound = True - elif power == 2: - # GammaDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif (power > 2) and (power < 3): - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False - elif power == 3: - # InverseGaussianDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif power > 3: - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False + elif 1 <= power < 2: + # Poisson or Compound Poisson distribution + self._lower_bound = 
DistributionBoundary(0, inclusive=True) + elif power >= 2: + # Gamma, Positive Stable, Inverse Gaussian distributions + self._lower_bound = DistributionBoundary(0, inclusive=False) else: # pragma: no cover # this branch should be unreachable. raise ValueError diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index f984077bed5cb..f9c329f35caa4 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -13,6 +13,7 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, + DistributionBoundary ) @@ -30,6 +31,19 @@ def test_family_bounds(family, expected): assert_array_equal(result, expected) +def test_invalid_distribution_bound(): + dist = TweedieDistribution() + dist._lower_bound = 0 + with pytest.raises(TypeError, + match="must be of type DistributionBoundary"): + dist.in_y_range([-1, 0, 1]) + + dist = TweedieDistribution() + dist._upper_bound = None + with pytest.raises(NotImplementedError): + dist.in_y_range([-1, 0, 1]) + + def test_tweedie_distribution_power(): with pytest.raises(ValueError, match="no distribution exists"): TweedieDistribution(power=0.5) @@ -42,9 +56,12 @@ def test_tweedie_distribution_power(): dist.power = 1j dist = TweedieDistribution() - assert dist._include_lower_bound is False + assert isinstance(dist._lower_bound, DistributionBoundary) + + assert dist._lower_bound.inclusive is False dist.power = 1 - assert dist._include_lower_bound is True + assert dist._lower_bound.value == 0.0 + assert dist._lower_bound.inclusive is True @pytest.mark.parametrize( From 228e8c8633d3bae7bbe9a36fa3fccb97567fd653 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 15:13:58 +0200 Subject: [PATCH 103/209] Move deviance checks under destribution --- sklearn/linear_model/_glm/distribution.py | 70 +++++++++++++++++-- .../_glm/tests/test_distribution.py | 3 +- sklearn/metrics/regression.py | 35 +--------- sklearn/metrics/tests/test_regression.py | 12 ++-- 4 files changed, 74 insertions(+), 46 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e6635d2003850..c30996662114e 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -113,7 +113,7 @@ def unit_variance_derivative(self, mu): pass # pragma: no cover @abstractmethod - def unit_deviance(self, y, mu): + def unit_deviance(self, y, mu, check_input=False): r"""Compute the unit deviance. The unit_deviance :math:`d(y,\mu)` can be defined by the @@ -128,6 +128,14 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or mu values, otherwise + they will be propagated as NaN. 
+ Returns + ------- + deviance: array, shape (n_samples,) + Computed deviance """ pass # pragma: no cover @@ -245,7 +253,8 @@ def power(self, power): # Extreme Stable or Normal distribution self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False) elif 0 < power < 1: - raise ValueError('For 0=1.') elif 1 <= power < 2: # Poisson or Compound Poisson distribution self._lower_bound = DistributionBoundary(0, inclusive=True) @@ -279,15 +288,66 @@ def unit_variance_derivative(self, mu): """ return self.power * np.power(mu, self.power - 1) - def unit_deviance(self, y, mu): + def unit_deviance(self, y, mu, check_input=False): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or mu values, otherwise + they will be propagated as NaN. + Returns + ------- + deviance: array, shape (n_samples,) + Computed deviance + """ p = self.power + + if check_input: + message = ("Mean Tweedie deviance error with p={} can only be " + "used on ".format(p)) + if p < 0: + # 'Extreme stable', y any realy number, mu > 0 + if (mu <= 0).any(): + raise ValueError(message + "strictly positive mu.") + elif p == 0: + # Normal, y and mu can be any real number + pass + elif 0 < p < 1: + raise ValueError("Tweedie deviance is only defined for p<=0 " + "and p>=1.") + elif 1 <= p < 2: + # Poisson and Compount poisson distribution, y >= 0, mu > 0 + if (y < 0).any() or (mu <= 0).any(): + raise ValueError(message + "non-negative y and strictly " + "positive mu.") + elif p >= 2: + # Gamma and Extreme stable distribution, y and mu > 0 + if (y <= 0).any() or (mu <= 0).any(): + raise ValueError(message + "strictly positive y and mu.") + else: # pragma: nocover + # Unreachable statement + raise ValueError + if p < 0: - # 'Extreme stable', y_true any realy number, y_pred > 0 + # 'Extreme stable', y any realy number, mu > 0 dev = 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p) * (2-p)) - y * np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + elif p == 0: - # Normal distribution, y_true and y_pred any real number + # Normal distribution, y and mu any real number dev = (y - mu)**2 elif p < 1: raise ValueError("Tweedie deviance is only defined for p<=0 and " diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index f9c329f35caa4..82e493b7a2149 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -45,7 +45,8 @@ def test_invalid_distribution_bound(): def test_tweedie_distribution_power(): - with pytest.raises(ValueError, match="no distribution exists"): + msg = "distribution is only defined for p<=0 and p>=1" + with pytest.raises(ValueError, match=msg): TweedieDistribution(power=0.5) with pytest.raises(TypeError, match="must be a real number"): diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 2f6c442c1f824..538fd7eec4631 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -22,9 +22,7 @@ # Christian Lorentzen # License: BSD 3 clause - import numpy as np -from scipy.special import xlogy import warnings from ..utils.validation import (check_array, check_consistent_length, @@ -676,39 
+674,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): sample_weight = column_or_1d(sample_weight) sample_weight = sample_weight[:, np.newaxis] - message = ("Mean Tweedie deviance error with p={} can only be used on " - .format(p)) - if p < 0: - # 'Extreme stable', y_true any realy number, y_pred > 0 - if (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_pred.") - elif p == 0: - pass - elif p < 1: - raise ValueError("Tweedie deviance is only defined for p<=0 and " - "p>=1.") - elif p == 1: - # Poisson distribution, y_true >= 0, y_pred > 0 - if (y_true < 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "non-negative y_true and strictly " - "positive y_pred.") - elif p == 2: - # Gamma distribution, y_true and y_pred > 0 - if (y_true <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_true and y_pred.") - else: - if p < 2: - # 1 < p < 2 is Compound Poisson, y_true >= 0, y_pred > 0 - if (y_true < 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "non-negative y_true and strictly " - "positive y_pred.") - else: - if (y_true <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_true and " - "y_pred.") - dist = TweedieDistribution(power=p) - dev = dist.unit_deviance(y_true, y_pred) + dev = dist.unit_deviance(y_true, y_pred, check_input=True) return np.average(dev, weights=sample_weight) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 526c27f0a036c..d46bca1301b1e 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -101,35 +101,35 @@ def test_regression_metrics_at_limits(): assert_allclose(mean_tweedie_deviance([0], [1.], p=p), 2./(2.-p), rtol=1e-3) with pytest.raises(ValueError, - match="can only be used on strictly positive y_pred."): + match="can only be used on strictly positive mu."): mean_tweedie_deviance([0.], [0.], p=p) assert_almost_equal(mean_tweedie_deviance([0.], [0.], p=0), 0.00, 2) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=1.0) p = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], p=p), 2./(2.-p)) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) p = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y_true and y_pred." + msg = "can only be used on strictly positive y and mu." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) p = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y_true and y_pred." + msg = "can only be used on strictly positive y and mu." 
with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) with pytest.raises(ValueError, - match="deviance is only defined for p<=0 and p>=1."): + match="is only defined for p<=0 and p>=1"): mean_tweedie_deviance([0.], [0.], p=0.5) From 09a57c9cfedfebb2452c50921260c310987a0057 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 18:12:55 +0200 Subject: [PATCH 104/209] Expose TweedieRegressor --- doc/modules/linear_model.rst | 6 +- ...lot_tweedie_regression_insurance_claims.py | 26 ++- sklearn/linear_model/__init__.py | 7 +- sklearn/linear_model/_glm/__init__.py | 14 +- sklearn/linear_model/_glm/glm.py | 154 ++++++++++++++++++ sklearn/linear_model/_glm/tests/test_glm.py | 38 ++++- 6 files changed, 221 insertions(+), 24 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 0eaaab2ecded5..cab918b06f3b0 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -949,10 +949,10 @@ Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows: - >>> from sklearn.linear_model import GeneralizedLinearRegressor - >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> from sklearn.linear_model import TweedieRegressor + >>> reg = TweedieRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + TweedieRegressor(alpha=0.5, family='poisson', link='log') >>> reg.coef_ array([0.2463..., 0.4337...]) >>> reg.intercept_ diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 418127699dc1a..55a21c8d8723d 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -41,8 +41,7 @@ from sklearn.datasets import fetch_openml from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, GammaRegressor -from sklearn.linear_model._glm import GeneralizedLinearRegressor -from sklearn.linear_model._glm.distribution import TweedieDistribution +from sklearn.linear_model import TweedieRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -418,7 +417,6 @@ class ClaimProdEstimator: def __init__(self, est_freq, est_sev): self.est_freq = est_freq self.est_sev = est_sev - self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): """Predict the total claim amount. @@ -429,11 +427,14 @@ def predict(self, X, exposure): def score(self, X, y, sample_weight=None): """Compute D², the percentage of deviance explained.""" + # TODO: remove this private import once d2_score is available + from sklearn.linear_model._glm.distribution import TweedieDistribution + mu = self.predict(X, exposure=sample_weight) - dev = self._family_instance.deviance(y, mu, weights=sample_weight) + family = TweedieDistribution(power=1.5) + dev = family.deviance(y, mu, weights=sample_weight) y_mean = np.average(y, weights=sample_weight) - dev_null = self._family_instance.deviance(y, y_mean, - weights=sample_weight) + dev_null = family.deviance(y, y_mean, weights=sample_weight) return 1. 
- dev / dev_null @@ -459,18 +460,13 @@ def score(self, X, y, sample_weight=None): from sklearn.model_selection import GridSearchCV -# this takes a while -params = { - "family": [ - TweedieDistribution(power=power) - # exclude upper bound as power=2 does not support null y samples. - for power in np.linspace(1 + 1e-4, 2 - 1e-4, 8) - ] -} +# exclude upper bound as power=2 does not support null y values. +params = {"power": np.linspace(1 + 1e-4, 2 - 1e-4, 8)} +# this takes a while glm_total = GridSearchCV( - GeneralizedLinearRegressor(tol=1e-3, max_iter=500), cv=3, + TweedieRegressor(tol=1e-3, max_iter=500), cv=3, param_grid=params, n_jobs=-1 ) glm_total.fit( diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 63a52a9c9898e..46d1efe63de2e 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, - GammaRegressor) +from ._glm import (PoissonRegressor, + GammaRegressor, TweedieRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -83,4 +83,5 @@ 'RANSACRegressor', 'GeneralizedLinearRegressor', 'PoissonRegressor', - 'GammaRegressor'] + 'GammaRegressor', + 'TweedieRegressor'] diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index a7d4b67817730..3b5c0d95d6124 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,5 +1,15 @@ # License: BSD 3 clause -from .glm import GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor +from .glm import ( + GeneralizedLinearRegressor, + PoissonRegressor, + GammaRegressor, + TweedieRegressor +) -__all__ = ["GeneralizedLinearRegressor", "PoissonRegressor", "GammaRegressor"] +__all__ = [ + "GeneralizedLinearRegressor", + "PoissonRegressor", + "GammaRegressor", + "TweedieRegressor" +] diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 12486e188951a..ca33c45c76292 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -623,6 +623,15 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) + @property + def family(self): + return "poisson" + + @family.setter + def family(self, value): + if value != "poisson": + raise ValueError("PoissonRegressor.family must be 'poisson'!") + class GammaRegressor(GeneralizedLinearRegressor): """Regression with the response variable y following a Gamma distribution @@ -729,3 +738,148 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) + + @property + def family(self): + return "gamma" + + @family.setter + def family(self, value): + if value != "gamma": + raise ValueError("GammaRegressor.family must be 'gamma'!") + + +class TweedieRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Tweedie distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). 
+ The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + power : float (default=0) + The variance power: :math:`v(\mu) = \mu^{power}`. + For ``0`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + + References + ---------- + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ + """ + def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', + fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family=TweedieDistribution(power=power), link=link, + fit_dispersion=fit_dispersion, solver=solver, + max_iter=max_iter, tol=tol, warm_start=warm_start, + copy_X=copy_X, verbose=verbose) + + @property + def family(self): + dist = TweedieDistribution(power=self.power) + # TODO: make the returned object immutable + return dist + + @family.setter + def family(self, value): + if isinstance(value, TweedieDistribution): + self.power = value.power + else: + raise TypeError("TweedieRegressor.family must be of type " + "TweedieDistribution!") diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 542c18b65cad2..a56155fe03f22 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -7,7 +7,12 @@ import pytest from sklearn.datasets import make_regression -from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import GeneralizedLinearRegressor +from sklearn.linear_model import ( + TweedieRegressor, + PoissonRegressor, + GammaRegressor +) from sklearn.linear_model._glm.link import ( IdentityLink, LogLink, @@ -353,3 +358,34 @@ def test_convergence_warning(solver, regression_data): max_iter=1, tol=1e-20) with pytest.warns(ConvergenceWarning): est.fit(X, y) + + +def test_poisson_regression_family(regression_data): + est = PoissonRegressor() + est.family == "poisson" + + msg = "PoissonRegressor.family must be 'poisson'!" + with pytest.raises(ValueError, match=msg): + est.family = 0 + + +def test_gamma_regression_family(regression_data): + est = GammaRegressor() + est.family == "gamma" + + msg = "GammaRegressor.family must be 'gamma'!" + with pytest.raises(ValueError, match=msg): + est.family = 0 + + +def test_tweedie_regression_family(regression_data): + power = 2.0 + est = TweedieRegressor(power=power) + assert isinstance(est.family, TweedieDistribution) + assert est.family.power == power + msg = "TweedieRegressor.family must be of type TweedieDistribution!" 
+ with pytest.raises(TypeError, match=msg): + est.family = None + + # TODO: the following should not be allowed + # est.family.power = 2 From 4b485cac821d30ece504a43d0c3316140cec5d33 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 18:32:07 +0200 Subject: [PATCH 105/209] Improve documentation --- doc/modules/classes.rst | 2 +- doc/modules/linear_model.rst | 38 ++++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 177cd0780f9be..e5b61faf352e4 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -751,7 +751,6 @@ Kernels: linear_model.ElasticNet linear_model.ElasticNetCV linear_model.GammaRegressor - linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV @@ -781,6 +780,7 @@ Kernels: linear_model.SGDClassifier linear_model.SGDRegressor linear_model.TheilSenRegressor + linear_model.TweedieRegressor .. autosummary:: :toctree: generated/ diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index cab918b06f3b0..f4a48fcaf3acd 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -916,14 +916,14 @@ likelihood as - loglike(y,y,\phi)\right) \\ D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) -===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` -===================================== =============================== ================================= ============================================ -Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` -Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` -Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` -===================================== =============================== ================================= ============================================ +================= =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +================= =============================== ================================= ============================================ +Normal :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +================= =============================== ================================= ============================================ In the following use cases, a loss different from the squared loss might be @@ -945,14 +945,32 @@ it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with :math:`h(x^\top w)=\exp(x^\top w)`. 
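[Editorial sketch, not part of this diff: the `family` properties added for the convenience estimators in this series can be inspected directly. A doctest-style illustration reflecting the behaviour on this branch:]

    >>> from sklearn.linear_model import PoissonRegressor, TweedieRegressor
    >>> PoissonRegressor().family                  # fixed by the subclass
    'poisson'
    >>> TweedieRegressor(power=1.5).family.power   # a TweedieDistribution built from ``power``
    1.5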
+:class:`linear_model.TweedieRegressor` implements a generalized linear model +for the Tweedie distribution, that allows to model any of the above mentionned +distribution using the appropriate power parameter `p`, + + - `p = 0`: Normal distribution. Specialized solvers such as + :class:`linear_model.Ridge`, :class:`linear_model.ElasticNet` are generally + more appropriate in this case. + + - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for + convinience however it is strictly equivalent to `TweedieRegressor(power=1)`. + + - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for + convinience however it is also strictly equivalent to + `TweedieRegressor(power=2)`. + + - `p = 3`: Inverse Gamma distribution. + + Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows: >>> from sklearn.linear_model import TweedieRegressor - >>> reg = TweedieRegressor(alpha=0.5, family='poisson', link='log') + >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - TweedieRegressor(alpha=0.5, family='poisson', link='log') + TweedieRegressor(alpha=0.5, power=1) >>> reg.coef_ array([0.2463..., 0.4337...]) >>> reg.intercept_ From aa0adf1e6304d9f8aa3497e2fa406244e8d3405b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 18:34:13 +0200 Subject: [PATCH 106/209] Lint --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index ca33c45c76292..7d7ef099cd04a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -766,7 +766,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: :math:`v(\mu) = \mu^{power}`. + The variance power: :math:`v(\\mu) = \\mu^{power}`. For ``0 Date: Tue, 30 Jul 2019 14:13:46 +0200 Subject: [PATCH 107/209] Fix __init__ --- sklearn/linear_model/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 46d1efe63de2e..9f696a14985c3 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -81,7 +81,6 @@ 'orthogonal_mp_gram', 'ridge_regression', 'RANSACRegressor', - 'GeneralizedLinearRegressor', 'PoissonRegressor', 'GammaRegressor', 'TweedieRegressor'] From 7a9d0674174af423b5bfe8290631db10c53a946b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 2 Aug 2019 15:39:19 +0200 Subject: [PATCH 108/209] Update doc/modules/linear_model.rst Co-Authored-By: Joel Nothman --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f4a48fcaf3acd..8123eac5fc4a2 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -954,7 +954,7 @@ distribution using the appropriate power parameter `p`, more appropriate in this case. - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for - convinience however it is strictly equivalent to `TweedieRegressor(power=1)`. + convenience however it is strictly equivalent to `TweedieRegressor(power=1)`. - `p = 2`: Gamma distribution. 
:class:`GammaRegressor` is exposed for convinience however it is also strictly equivalent to From 18b45037fa6cb4d172c4e1b3cb92c1c33c0700c4 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 2 Aug 2019 15:39:29 +0200 Subject: [PATCH 109/209] Update doc/modules/linear_model.rst Co-Authored-By: Joel Nothman --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 8123eac5fc4a2..d159a8ad77039 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -957,7 +957,7 @@ distribution using the appropriate power parameter `p`, convenience however it is strictly equivalent to `TweedieRegressor(power=1)`. - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for - convinience however it is also strictly equivalent to + convenience however it is strictly equivalent to `TweedieRegressor(power=2)`. - `p = 3`: Inverse Gamma distribution. From 29658d66ff34eb633f2728d69a956cde760c271e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 2 Aug 2019 15:39:40 +0200 Subject: [PATCH 110/209] Update doc/modules/linear_model.rst Co-Authored-By: Joel Nothman --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d159a8ad77039..02c01b674f467 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -965,7 +965,7 @@ distribution using the appropriate power parameter `p`, Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as -follows: +follows:: >>> from sklearn.linear_model import TweedieRegressor >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') From 1ea70d3a0c2da89ff164aef9975080de66a1cebd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 7 Aug 2019 08:22:56 +0200 Subject: [PATCH 111/209] Fix typos in documentation --- doc/modules/linear_model.rst | 13 +++++++------ sklearn/linear_model/_glm/distribution.py | 2 +- sklearn/linear_model/_glm/glm.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 02c01b674f467..ce160d341af5f 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -733,8 +733,8 @@ of a single trial are modeled using a `logistic function `_. Logistic regression is implemented in :class:`LogisticRegression`. -This implementation can fit binary, One-vs-Rest, or multinomial logistic -regression with optional :math:`\ell_1`, :math:`\ell_2` or Elastic-Net +This implementation can fit binary, One-vs-Rest, or multinomial logistic +regression with optional :math:`\ell_1`, :math:`\ell_2` or Elastic-Net regularization. .. note:: @@ -946,18 +946,19 @@ it is convenient to apply a link function different from the identity link log-link with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`linear_model.TweedieRegressor` implements a generalized linear model -for the Tweedie distribution, that allows to model any of the above mentionned -distribution using the appropriate power parameter `p`, +for the Tweedie distribution, that allows to model any of the above mentioned +distributions using the appropriate power parameter `p`, - `p = 0`: Normal distribution. Specialized solvers such as :class:`linear_model.Ridge`, :class:`linear_model.ElasticNet` are generally more appropriate in this case. 
- `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for - convenience however it is strictly equivalent to `TweedieRegressor(power=1)`. + convenience. However, it is strictly equivalent to + `TweedieRegressor(power=1)`. - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for - convenience however it is strictly equivalent to + convenience. However, it is strictly equivalent to `TweedieRegressor(power=2)`. - `p = 3`: Inverse Gamma distribution. diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index c30996662114e..c7b4ec6c7836d 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -224,7 +224,7 @@ class TweedieDistribution(ExponentialDispersionModel): ===== ================ 0 Normal 1 Poisson - (0,1) Compound Poisson + (1,2) Compound Poisson 2 Gamma 3 Inverse Gaussian diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 7d7ef099cd04a..af411c38f87ca 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -776,7 +776,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): ===== ================ 0 Normal 1 Poisson - (0,1) Compound Poisson + (1,2) Compound Poisson 2 Gamma 3 Inverse Gaussian From efdcb5be1ed18d681c9b7c358b6f23adb6d0f795 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 16:58:46 +0300 Subject: [PATCH 112/209] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index ce160d341af5f..67e4fbbde182c 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -964,7 +964,7 @@ distributions using the appropriate power parameter `p`, - `p = 3`: Inverse Gamma distribution. -Note that the feature matrix ``X`` should be standardized before fitting. This +Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows:: From ef0d063ec406c73aaee715af4cf06e5ca1f3b78c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 16:59:26 +0300 Subject: [PATCH 113/209] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 67e4fbbde182c..9645690d594f6 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -945,7 +945,7 @@ it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with :math:`h(x^\top w)=\exp(x^\top w)`. 
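
The "strictly equivalent" statements above lend themselves to a quick numerical spot check. The sketch below assumes that ``PoissonRegressor`` and ``TweedieRegressor`` from this series are importable from ``sklearn.linear_model``; both should reach the same coefficients because they minimize the same penalized deviance with the same solver::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor, TweedieRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(50, 2))
    y = rng.poisson(np.exp(X @ np.array([0.5, -0.2])))

    a = PoissonRegressor(alpha=1.0).fit(X, y)
    b = TweedieRegressor(power=1, alpha=1.0, link='log').fit(X, y)
    # Coefficients should agree up to the numerical tolerance of the solver.
    print(np.allclose(a.coef_, b.coef_))
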
-:class:`linear_model.TweedieRegressor` implements a generalized linear model +:class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned distributions using the appropriate power parameter `p`, From 0125e1cbe9105f8d4f4cf08ee652add64deee384 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 16:59:59 +0300 Subject: [PATCH 114/209] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 9645690d594f6..fe52cad236392 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -950,7 +950,7 @@ for the Tweedie distribution, that allows to model any of the above mentioned distributions using the appropriate power parameter `p`, - `p = 0`: Normal distribution. Specialized solvers such as - :class:`linear_model.Ridge`, :class:`linear_model.ElasticNet` are generally + :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in this case. - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for From 6a8a600258c8be783f02dad7519fc2aa012418eb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:02:46 +0300 Subject: [PATCH 115/209] Update examples/linear_model/plot_poisson_regression_non_normal_loss.py Co-Authored-By: Nicolas Hug --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 347a424d8f4fe..6cee852866018 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -238,7 +238,7 @@ def load_mtpl2(n_samples=100000): y_pred = model.predict(X_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) - ax[idx+1].set_title(model.__class__.__name__) + ax[idx + 1].set_title(model.__class__.__name__) for axi in ax: axi.set( From 73f3bd1f646ec1dc1b9a3808149cc185d3fdea0d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:13:00 +0300 Subject: [PATCH 116/209] Rename inverse.gaussian to inverse-gaussian --- sklearn/linear_model/_glm/distribution.py | 2 +- sklearn/linear_model/_glm/glm.py | 6 +++--- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index c7b4ec6c7836d..950fa3fbb03e7 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -393,5 +393,5 @@ def __init__(self): 'normal': NormalDistribution, 'poisson': PoissonDistribution, 'gamma': GammaDistribution, - 'inverse.gaussian': InverseGaussianDistribution, + 'inverse-gaussian': InverseGaussianDistribution, } diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index af411c38f87ca..86f4c544d8f84 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -57,7 +57,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). 
- family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} \ + family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \ or an instance of class ExponentialDispersionModel, \ optional(default='normal') The distributional assumption of the GLM, i.e. which distribution from @@ -71,7 +71,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'identity' for family 'normal' - - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + - 'log' for families 'poisson', 'gamma', 'inverse-gaussian' fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use @@ -217,7 +217,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian']" + " ['normal', 'poisson', 'gamma', 'inverse-gaussian']" "; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a56155fe03f22..12edbefbf2833 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -65,7 +65,7 @@ def test_sample_weights_validation(): [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution())]) + ('inverse-gaussian', InverseGaussianDistribution())]) def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions From 11b178fffc7d279596acd122c7b6c33ba4b84a2f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:17:42 +0300 Subject: [PATCH 117/209] Remove sample_weight parameter from predict --- sklearn/linear_model/_glm/glm.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 86f4c544d8f84..921317cd0965a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -388,32 +388,25 @@ def _linear_predictor(self, X): allow_nd=False) return X @ self.coef_ + self.intercept_ - def predict(self, X, sample_weight=None): + def predict(self, X): """Predict using GLM with feature matrix X. - If sample_weight is given, returns prediction*sample_weight. - Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. - sample_weight : {None, array-like}, shape (n_samples,), optional \ - (default=None) - Returns ------- C : array, shape (n_samples,) - Returns predicted values times sample_weight. + Returns predicted values. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', ensure_2d=True, allow_nd=False) eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) - weights = _check_sample_weight(sample_weight, X) - - return mu*weights + return mu def estimate_phi(self, X, y, sample_weight=None): """Estimate/fit the dispersion parameter phi. 
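
After this change, ``predict`` only takes ``X`` and returns the estimated mean ``h(X @ coef_ + intercept_)``; sample weights enter the objective during ``fit`` but never rescale predictions. A minimal sketch of the resulting behaviour, assuming ``PoissonRegressor`` from this series::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    X = np.array([[1.0], [2.0], [3.0], [4.0]])
    y = np.array([1.0, 2.0, 3.0, 5.0])
    s = np.array([1.0, 2.0, 1.0, 0.5])   # used only while fitting

    reg = PoissonRegressor(alpha=0.1).fit(X, y, sample_weight=s)
    # Predicted means mu; no sample_weight argument and no rescaling by s.
    print(reg.predict(X))
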
From 3806fbe05c164d3fd74aada0b8a068c43d3998cd Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:22:20 +0300 Subject: [PATCH 118/209] Remove redundant check_array in predict --- sklearn/linear_model/_glm/glm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 921317cd0965a..0ab9e8c1db777 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -384,7 +384,7 @@ def _linear_predictor(self, X): """ check_is_fitted(self, "coef_") X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', copy=True, ensure_2d=True, + dtype='numeric', ensure_2d=True, allow_nd=False) return X @ self.coef_ + self.intercept_ @@ -401,9 +401,7 @@ def predict(self, X): C : array, shape (n_samples,) Returns predicted values. """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', ensure_2d=True, - allow_nd=False) + # check_array is done in _linear_predictor eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) return mu From ae1c6721876fe5e9f1f14cd092fb198067899457 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 11 Aug 2019 23:11:52 +0200 Subject: [PATCH 119/209] Update doc/modules/linear_model.rst --- doc/modules/linear_model.rst | 66 +++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index fe52cad236392..09b657c26b915 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -900,30 +900,31 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). -Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The minimized objective function is -the penalized negative log likelihood, +Secondly, the squared loss function is replaced by the unit deviance :math:`d` +of a reproductive exponential dispersion model (EDM) [11]_. The minimization +problem becomes -.. math:: \frac{1}{2 \sum s_i}D(y, \hat{y}; s) +\frac{\alpha}{2} ||w||_2 +.. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, x_i)) + \frac{\alpha}{2} ||w||_2 with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. - -The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as .. math:: d(y, \mu) = -2\phi\cdot \left(loglike(y,\mu,\phi) - - loglike(y,y,\phi)\right) \\ - D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + - loglike(y,y,\phi)\right) + +The following table lists some specific EDM distributions—all are Tweedie +distributions—and some properties. 
-================= =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` -================= =============================== ================================= ============================================ -Normal :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` -Gamma :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` -Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` -================= =============================== ================================= ============================================ +================= =============================== ====================================== ============================================ +Distribution Target Domain Unit Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +================= =============================== ====================================== ============================================ +Normal :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +================= =============================== ====================================== ============================================ In the following use cases, a loss different from the squared loss might be @@ -947,7 +948,8 @@ log-link with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned -distributions using the appropriate power parameter `p`, +distributions using the appropriate power parameter `p`, i.e. the exponent of +the unit variance function, - `p = 0`: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally @@ -964,9 +966,16 @@ distributions using the appropriate power parameter `p`, - `p = 3`: Inverse Gamma distribution. -Note that the feature matrix `X` should be standardized before fitting. This -ensures that the penalty treats features equally. The estimator can be used as -follows:: +Note: +* The feature matrix `X` should be standardized before fitting. This + ensures that the penalty treats features equally. +* If you want to model a relative frequency, i.e. counts per exposure (time, + volume, ...) you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. This is done in both + examples linked below. + +The estimator can be used as follows:: >>> from sklearn.linear_model import TweedieRegressor >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') @@ -993,7 +1002,7 @@ In the unpenalized case, the assumptions are the following: with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter :math:`\phi` and sample weights :math:`s_i`. * The aim is to predict the expectation :math:`\mu_i` with - :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\hat{y}_i = h(\eta_i)`, linear predictor :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. 
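
These assumptions can be made concrete with a small simulation: draw Poisson targets whose mean is ``exp(X @ w)`` and check that the fitted coefficients approximately recover ``w``. The sketch below assumes ``PoissonRegressor`` (log link) from this series and uses an almost vanishing penalty::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    rng = np.random.RandomState(42)
    w = np.array([0.5, -0.25])
    X = rng.uniform(-1, 1, size=(10000, 2))
    y = rng.poisson(np.exp(X @ w))   # E[Y_i] = h(eta_i) = exp((X w)_i)

    reg = PoissonRegressor(alpha=1e-6, fit_intercept=False).fit(X, y)
    print(reg.coef_)   # approximately [0.5, -0.25], up to sampling noise
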
Note that the first assumption implies @@ -1001,21 +1010,16 @@ Note that the first assumption implies function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the same as specifying a unit variance function (they are one-to-one). -The objective function (the penalized negative log likelihood) is -independent of :math:`\phi` and is minimized with respect to the -coefficients :math:`w`. - -Two remarks: +A few remarks: +* The deviance is independent of :math:`\phi`. Therefore, also the estimation + of the coefficients :math:`w` is independent of the dispersion parameter of + the EDM. +* The minimization is equivalent to (penalized) maximum likelihood estimation. * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. (19)-(20) in [12]_. -* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) - you can do so by a Poisson distribution and passing - :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together - with :math:`s=\mathrm{exposure}` as sample weights. - .. topic:: References: From f07c831c29bc37ab0b922e1482d9f027f84049c1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 11 Aug 2019 23:32:57 +0200 Subject: [PATCH 120/209] Remove dispersion --- sklearn/linear_model/_glm/glm.py | 116 +++----------------- sklearn/linear_model/_glm/tests/test_glm.py | 18 --- 2 files changed, 15 insertions(+), 119 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0ab9e8c1db777..cf9b00527c8cf 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -73,11 +73,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'log' for families 'poisson', 'gamma', 'inverse-gaussian' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'auto', 'lbfgs'}, optional (default='auto') Algorithm to use in the optimization problem: @@ -124,9 +119,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. @@ -169,14 +161,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', - fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, + solver='auto', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept self.family = family self.link = link - self.fit_dispersion = fit_dispersion self.solver = solver self.max_iter = max_iter self.tol = tol @@ -310,8 +300,8 @@ def fit(self, X, y, sample_weight=None): weights = weights/weights_sum # initialization of coef = (intercept_, coef) - # Note: Since phi=self.dispersion_ does not enter the estimation - # of mu_i=E[y_i], set it to 1. + # Note: The dispersion parameter phi does not enter the estimation + # of mu_i=E[y_i]. if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: @@ -363,10 +353,6 @@ def func(coef, X, y, weights, alpha, family, link): self.intercept_ = 0. 
self.coef_ = coef - if self.fit_dispersion in ['chisqr', 'deviance']: - # attention because of rescaling of weights - self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum - return self def _linear_predictor(self, X): @@ -406,51 +392,6 @@ def predict(self, X): mu = self._link_instance.inverse(eta) return mu - def estimate_phi(self, X, y, sample_weight=None): - """Estimate/fit the dispersion parameter phi. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training data. - - y : array-like, shape (n_samples,) - Target values. - - sample_weight : {None, array-like}, shape (n_samples,), optional \ - (default=None) - Sample weights. - - Returns - ------- - phi : float - Dispersion parameter. - """ - check_is_fitted(self, "coef_") - _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - dtype=_dtype, y_numeric=True, multi_output=False) - n_samples, n_features = X.shape - weights = _check_sample_weight(sample_weight, X) - eta = X @ self.coef_ - if self.fit_intercept is True: - eta += self.intercept_ - n_features += 1 - if n_samples <= n_features: - raise ValueError("Estimation of dispersion parameter phi requires" - " more samples than features, got" - " samples=X.shape[0]={0} and" - " n_features=X.shape[1]+fit_intercept={1}." - .format(n_samples, n_features)) - mu = self._link_instance.inverse(eta) - if self.fit_dispersion == 'chisqr': - chisq = np.sum(weights*(y-mu)**2 / - self._family_instance.unit_variance(mu)) - return chisq/(n_samples - n_features) - elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, weights) - return dev/(n_samples - n_features) - def score(self, X, y, sample_weight=None): """Compute D^2, the percentage of deviance explained. @@ -525,11 +466,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: @@ -564,9 +500,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. @@ -605,14 +538,13 @@ class PoissonRegressor(GeneralizedLinearRegressor): `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', - fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link=link, - fit_dispersion=fit_dispersion, solver=solver, - max_iter=max_iter, tol=tol, warm_start=warm_start, - copy_X=copy_X, verbose=verbose) + solver=solver, max_iter=max_iter, tol=tol, + warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property def family(self): @@ -652,11 +584,6 @@ class GammaRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). 
- fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: @@ -691,9 +618,6 @@ class GammaRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. @@ -721,14 +645,13 @@ class GammaRegressor(GeneralizedLinearRegressor): `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', - fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="gamma", link=link, - fit_dispersion=fit_dispersion, solver=solver, - max_iter=max_iter, tol=tol, warm_start=warm_start, - copy_X=copy_X, verbose=verbose) + solver=solver, max_iter=max_iter, tol=tol, + warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property def family(self): @@ -783,11 +706,6 @@ class TweedieRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: @@ -822,9 +740,6 @@ class TweedieRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. 
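
With ``fit_dispersion`` and ``dispersion_`` removed, a Pearson-type estimate of the dispersion can still be computed by hand after fitting. The sketch below mirrors the chi-squared formula that is deleted in this patch, written out for the Poisson case where ``v(mu) = mu``; it is an illustration only, not functionality of the estimator::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(200, 3))
    y = rng.poisson(np.exp(X @ np.array([0.3, -0.5, 0.2])))

    reg = PoissonRegressor(alpha=1e-4).fit(X, y)
    mu = reg.predict(X)

    # Pearson chi-square estimate of phi:
    # sum((y - mu)**2 / v(mu)) / (n_samples - n_parameters), with v(mu) = mu here.
    dof = X.shape[0] - (X.shape[1] + 1)   # +1 for the fitted intercept
    phi_hat = np.sum((y - mu) ** 2 / mu) / dof
    print(phi_hat)   # close to 1 for a well specified Poisson model
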
@@ -852,14 +767,13 @@ class TweedieRegressor(GeneralizedLinearRegressor): `_ """ def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', - fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, - fit_dispersion=fit_dispersion, solver=solver, - max_iter=max_iter, tol=tol, warm_start=warm_start, - copy_X=copy_X, verbose=verbose) + solver=solver, max_iter=max_iter, tol=tol, + warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property def family(self): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 12edbefbf2833..5bd80cbf76fcf 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -332,24 +332,6 @@ def test_solver_equivalence(params, regression_data): ) -def test_fit_dispersion(regression_data): - X, y = regression_data - - est1 = GeneralizedLinearRegressor() - est1.fit(X, y) - assert not hasattr(est1, "dispersion_") - - est2 = GeneralizedLinearRegressor(fit_dispersion="chisqr") - est2.fit(X, y) - assert isinstance(est2.dispersion_, float) - - est3 = GeneralizedLinearRegressor(fit_dispersion="deviance") - est3.fit(X, y) - assert isinstance(est3.dispersion_, float) - - assert_allclose(est2.dispersion_, est3.dispersion_) - - @pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From ebbbe9cd1c670e15aca835b92730f570c49e6e4c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 13 Aug 2019 08:44:50 +0200 Subject: [PATCH 121/209] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09b657c26b915..f352860b6826f 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1003,7 +1003,7 @@ In the unpenalized case, the assumptions are the following: :math:`\phi` and sample weights :math:`s_i`. * The aim is to predict the expectation :math:`\mu_i` with :math:`\hat{y}_i = h(\eta_i)`, linear predictor - :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h`. 
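
The sample weights :math:`s_i` in the assumptions above are also what makes the "counts per exposure" recipe from this section work: pass the observed frequency as target and the exposure as ``sample_weight``. A minimal sketch, assuming ``PoissonRegressor`` from this series::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    counts = np.array([0.0, 1.0, 3.0, 2.0, 5.0])
    exposure = np.array([0.5, 1.0, 2.0, 1.0, 3.0])   # e.g. policy-years observed
    X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0]])

    freq = counts / exposure                 # y = counts / exposure
    reg = PoissonRegressor(alpha=1.0).fit(X, freq, sample_weight=exposure)
    print(reg.predict(X))                    # predicted frequency per unit exposure
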
Note that the first assumption implies :math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance From 918e2574ad99724d4d4a437af5283f32442d5752 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 13 Aug 2019 08:53:57 +0200 Subject: [PATCH 122/209] Update doc/modules/linear_model.rst --- doc/modules/linear_model.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09b657c26b915..32b7494ba7c00 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -927,6 +927,9 @@ Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` ================= =============================== ====================================== ============================================ +Usage +----- + In the following use cases, a loss different from the squared loss might be appropriate, @@ -944,7 +947,7 @@ Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the -log-link with :math:`h(x^\top w)=\exp(x^\top w)`. +log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned @@ -1018,7 +1021,9 @@ A few remarks: * The minimization is equivalent to (penalized) maximum likelihood estimation. * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [12]_. + (19)-(20) in [12]_. This means that, given an appropriate feature matrix `X`, + you get good (asymptotic) estimators for the expectation when using these + deviances. .. topic:: References: From 37d0f47bac45c27d01f949a835ecddad471b8d42 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:14:20 +0300 Subject: [PATCH 123/209] Use double `` when necessary --- doc/modules/linear_model.rst | 10 +++++----- sklearn/linear_model/_glm/glm.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d0e3f9542a641..7bb684e8bcf87 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -951,22 +951,22 @@ log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned -distributions using the appropriate power parameter `p`, i.e. the exponent of +distributions using the appropriate power parameter ``p``, i.e. the exponent of the unit variance function, - - `p = 0`: Normal distribution. Specialized solvers such as + - ``p = 0``: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in this case. - - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for + - ``p = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=1)`. - - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for + - ``p = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=2)`. 
- - `p = 3`: Inverse Gamma distribution. + - ``p = 3``: Inverse Gamma distribution. Note: diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index cf9b00527c8cf..baed5fecb2b97 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -397,7 +397,7 @@ def score(self, X, y, sample_weight=None): D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal - for family='normal'. + for ``family='normal'``. D^2 is defined as :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, From 9c337f25ea389c15e32ce943292fbe7b953a95b3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:16:59 +0300 Subject: [PATCH 124/209] ax -> axes in plot_poisson_regression_non_normal_loss.py --- .../plot_poisson_regression_non_normal_loss.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 6cee852866018..5c0b64faea255 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -228,19 +228,19 @@ def load_mtpl2(n_samples=100000): # histogram of observed target values with that of predicted values, -fig, ax = plt.subplots(1, 4, figsize=(16, 3)) +fig, axes = plt.subplots(1, 4, figsize=(16, 3)) -df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=ax[0]) +df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=axes[0]) -ax[0].set_title('Experimental data') +axes[0].set_title('Experimental data') for idx, model in enumerate([linregr, glm_freq, gbr]): y_pred = model.predict(X_train) - pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) - ax[idx + 1].set_title(model.__class__.__name__) + pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) + axes[idx + 1].set_title(model.__class__.__name__) -for axi in ax: +for axi in axes: axi.set( yscale='log', xlabel="y (Frequency)" From 5e05935caf29ff6bffb25ab5c83a1e4c855b83b0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:21:13 +0300 Subject: [PATCH 125/209] Update sklearn/linear_model/_glm/distribution.py Co-Authored-By: Nicolas Hug --- sklearn/linear_model/_glm/distribution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 950fa3fbb03e7..b1845eb13e921 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -206,7 +206,7 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): if coef.size == X.shape[1] + 1: devp = np.concatenate(([temp.sum()], temp @ X)) else: - devp = temp @ X # sampe as X.T @ temp + devp = temp @ X # same as X.T @ temp return mu, devp From 4a6821393715e42150fb8673f3d45840002d0114 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:26:47 +0300 Subject: [PATCH 126/209] Remove solver=auto --- sklearn/linear_model/_glm/glm.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index baed5fecb2b97..4786d231336b6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -73,19 +73,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'log' for families 'poisson', 'gamma', 
'inverse-gaussian' - solver : {'auto', 'lbfgs'}, optional (default='auto') + solver : 'lbfgs', optional (default='lbfgs') Algorithm to use in the optimization problem: - 'auto' - Sets 'lbfgs' - 'lbfgs' Calls scipy's L-BFGS-B optimizer. - - Note that all solvers except lbfgs use the fisher matrix, i.e. the - expected Hessian instead of the Hessian matrix. - max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -161,7 +154,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', - solver='auto', max_iter=100, tol=1e-4, warm_start=False, + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept @@ -243,14 +236,11 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'lbfgs']: + if self.solver not in ['lbfgs']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - "'auto', 'lbfgs';" - " got {0}".format(self.solver)) + "'lbfgs'; got {0}".format(self.solver)) solver = self.solver - if self.solver == 'auto': - solver = 'lbfgs' - if (not isinstance(self.max_iter, int) + if (not isinstance(self.max_iter, numbers.Integral) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " "integer;" From 8ee5c85a0140175e354ca9f8dd4db08f23606b9f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:38:56 +0300 Subject: [PATCH 127/209] Update sklearn/linear_model/_glm/glm.py Co-Authored-By: Nicolas Hug --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index baed5fecb2b97..f770337e40e62 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -384,7 +384,7 @@ def predict(self, X): Returns ------- - C : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Returns predicted values. 
""" # check_array is done in _linear_predictor From a1f8aabb6042aac39929a65faaac78e3b6b68e32 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:39:53 +0300 Subject: [PATCH 128/209] More review comments --- sklearn/linear_model/_glm/glm.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f81408a83a5cf..25f018c3e0eb4 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -261,16 +261,14 @@ def fit(self, X, y, sample_weight=None): family = self._family_instance link = self._link_instance - _dtype = [np.float64, np.float32] - _stype = ['csc', 'csr'] - X, y = check_X_y(X, y, accept_sparse=_stype, - dtype=_dtype, y_numeric=True, multi_output=False, - copy=self.copy_X) + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], + dtype=[np.float64, np.float32], + y_numeric=True, multi_output=False, copy=self.copy_X) y = np.asarray(y, dtype=np.float64) weights = _check_sample_weight(sample_weight, X) - n_samples, n_features = X.shape + _, n_features = X.shape if self.check_input: if not np.all(family.in_y_range(y)): @@ -287,7 +285,7 @@ def fit(self, X, y, sample_weight=None): # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) - weights = weights/weights_sum + weights = weights / weights_sum # initialization of coef = (intercept_, coef) # Note: The dispersion parameter phi does not enter the estimation @@ -355,7 +353,7 @@ def _linear_predictor(self, X): Returns ------- - C : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") From c0999ead8e8e834456f4a0f61ec77cec94790f4d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:44:36 +0300 Subject: [PATCH 129/209] Addressing reviews in tests --- sklearn/linear_model/_glm/glm.py | 4 ---- sklearn/linear_model/_glm/tests/test_glm.py | 16 ++++++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 25f018c3e0eb4..c1f7aecd32e4f 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -287,10 +287,6 @@ def fit(self, X, y, sample_weight=None): weights_sum = np.sum(weights) weights = weights / weights_sum - # initialization of coef = (intercept_, coef) - # Note: The dispersion parameter phi does not enter the estimation - # of mu_i=E[y_i]. 
- if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 5bd80cbf76fcf..c4e8c883ff379 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -61,17 +61,17 @@ def test_sample_weights_validation(): glm.fit(X, y, weights) -@pytest.mark.parametrize('f, fam', +@pytest.mark.parametrize('name, instance', [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse-gaussian', InverseGaussianDistribution())]) -def test_glm_family_argument(f, fam): +def test_glm_family_argument(name, instance): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert isinstance(glm._family_instance, fam.__class__) + glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y) + assert isinstance(glm._family_instance, instance.__class__) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -79,15 +79,15 @@ def test_glm_family_argument(f, fam): glm.fit(X, y) -@pytest.mark.parametrize('l, link', +@pytest.mark.parametrize('name, instance', [('identity', IdentityLink()), ('log', LogLink())]) -def test_glm_link_argument(l, link): +def test_glm_link_argument(name, instance): """Test GLM link argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert isinstance(glm._link_instance, link.__class__) + glm = GeneralizedLinearRegressor(family='normal', link=name).fit(X, y) + assert isinstance(glm._link_instance, instance.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError, match="link must be"): From e09e3368d5e3c173569dba1669cbf97334d20db0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:50:37 +0300 Subject: [PATCH 130/209] More comments in tests --- sklearn/linear_model/_glm/tests/test_glm.py | 52 ++++----------------- 1 file changed, 9 insertions(+), 43 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c4e8c883ff379..ebcab6395e5b4 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -23,7 +23,6 @@ GammaDistribution, InverseGaussianDistribution, ) from sklearn.linear_model import Ridge -from sklearn.metrics import mean_absolute_error from sklearn.exceptions import ConvergenceWarning GLM_SOLVERS = ['lbfgs'] @@ -183,8 +182,7 @@ def test_glm_identity_regression(solver): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', - fit_intercept=False, solver=solver, - tol=1e-7) + fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=1e-6) @@ -242,18 +240,13 @@ def test_warm_start(fit_intercept): @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('solver', GLM_SOLVERS) def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): - """Test ridge regression for Normal distributions. - - Case n_samples >> n_features - - Compare to test_ridge in test_ridge.py. 
- """ + """Compare with Ridge regression for Normal distributions.""" alpha = 1.0 n_predict = 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=42) + X, y, _ = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=42) y = y[0:n_samples] X, T = X[0:n_samples], X[n_samples:] @@ -279,10 +272,9 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): @pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-7)]) -def test_poisson_ridge(solver, tol): - """Test ridge regression with poisson family and LogLink. - - Compare to R's glmnet""" +def test_poisson_glmnet(solver, tol): + """Compare Poisson regression with L2 regularization and LogLink to glmnet + """ # library("glmnet") # options(digits=10) # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) @@ -306,32 +298,6 @@ def test_poisson_ridge(solver, tol): assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) -@pytest.mark.parametrize( - "params", - [ - {"solver": "lbfgs"}, - ], - ids=lambda params: ', '.join("%s=%s" % (key, val) - for key, val in params.items()) -) -def test_solver_equivalence(params, regression_data): - X, y = regression_data - est_ref = GeneralizedLinearRegressor() - est_ref.fit(X, y) - - estimator = GeneralizedLinearRegressor(**params) - - estimator.fit(X, y) - - assert_allclose(estimator.intercept_, est_ref.intercept_, rtol=1e-4) - assert_allclose(estimator.coef_, est_ref.coef_, rtol=1e-4) - assert_allclose( - mean_absolute_error(estimator.predict(X), y), - mean_absolute_error(est_ref.predict(X), y), - rtol=1e-4 - ) - - @pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From 6601d3049d3637c1823ff709d5416b306386d46a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 15:49:25 +0200 Subject: [PATCH 131/209] Update linear_model.rst --- doc/modules/linear_model.rst | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 7bb684e8bcf87..8b3764e7cefd0 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -914,8 +914,8 @@ likelihood as \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right) -The following table lists some specific EDM distributions—all are Tweedie -distributions—and some properties. +The following table lists some specific EDM distributions—all are Tweedie +distributions—and some of their properties. ================= =============================== ====================================== ============================================ Distribution Target Domain Unit Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` @@ -943,7 +943,7 @@ appropriate, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). -Since the linear predictor :math:`Xw` can be negative and +Since the linear predictor :math:`x^\top w` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the @@ -951,32 +951,33 @@ log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. 
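
As a concrete instance of the use cases listed in this section (positive, right-skewed targets), a Gamma GLM with the default log link can be sketched as follows; ``GammaRegressor`` from this series is assumed, and the coefficients are only expected to be recovered roughly, since the simulated targets are noisy::

    import numpy as np
    from sklearn.linear_model import GammaRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(1000, 2))
    # Positive, right-skewed targets with mean exp(X @ w), e.g. claim amounts.
    mu = np.exp(X @ np.array([0.4, -0.3]))
    y = rng.gamma(shape=2.0, scale=mu / 2.0)   # E[y] = shape * scale = mu

    reg = GammaRegressor(alpha=1e-3).fit(X, y)
    print(reg.coef_)   # roughly [0.4, -0.3], up to noise and the small penalty
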
:class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned -distributions using the appropriate power parameter ``p``, i.e. the exponent of -the unit variance function, +distributions using the appropriate ``power`` parameter, i.e. the exponent +of the unit variance function, - - ``p = 0``: Normal distribution. Specialized solvers such as + - ``power = 0``: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in this case. - - ``p = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed for + - ``power = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=1)`. - - ``p = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for + - ``power = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=2)`. - - ``p = 3``: Inverse Gamma distribution. + - ``power = 3``: Inverse Gamma distribution. -Note: -* The feature matrix `X` should be standardized before fitting. This - ensures that the penalty treats features equally. -* If you want to model a relative frequency, i.e. counts per exposure (time, - volume, ...) you can do so by a Poisson distribution and passing - :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together - with :math:`s=\mathrm{exposure}` as sample weights. This is done in both - examples linked below. +.. note:: + + * The feature matrix `X` should be standardized before fitting. This + ensures that the penalty treats features equally. + * If you want to model a relative frequency, i.e. counts per exposure (time, + volume, ...) you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values + together with :math:`s=\mathrm{exposure}` as sample weights. This is done + in both examples linked below. The estimator can be used as follows:: From 5174dae1cbc7ea9442243f15f3887f8252520f46 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 18:55:24 +0200 Subject: [PATCH 132/209] Address check_is_fitted deprication of attributes --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index c1f7aecd32e4f..13ca3673c8b3d 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -352,7 +352,7 @@ def _linear_predictor(self, X): y_pred : array, shape (n_samples,) Returns predicted values of linear predictor. 
""" - check_is_fitted(self, "coef_") + check_is_fitted(self) X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', ensure_2d=True, allow_nd=False) From 61dc13fdd5b71fe0e1d30e5a9b58934441a9dcb1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 20:18:21 +0200 Subject: [PATCH 133/209] No LaTeX in docstrings --- sklearn/linear_model/_glm/distribution.py | 89 ++++++++++------------- sklearn/linear_model/_glm/glm.py | 66 ++++++++--------- 2 files changed, 70 insertions(+), 85 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index b1845eb13e921..1baf99a465326 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -26,19 +26,15 @@ def _safe_lin_pred(X, coef): class ExponentialDispersionModel(metaclass=ABCMeta): - r"""Base class for reproductive Exponential Dispersion Models (EDM). + """Base class for reproductive Exponential Dispersion Models (EDM). - The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + The pdf of Y∼EDM(μ, φ) is given by:: - .. math:: p(y| \theta, \phi) = c(y, \phi) - \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) - = \tilde{c}(y, \phi) - \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + p(y| θ, φ) = c1(y, φ) * exp((θy-A(θ))/φ) + = c2(y, φ) * exp(-d(y, μ)/(2φ)) - with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, - variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, - unit variance :math:`v(\mu)` and - unit deviance :math:`d(y,\mu)`. + with mean E[Y] = A'(θ) = μ, variance Var[Y] = φ * v(μ), + unit variance v(μ), unit deviance d(y,μ) and dispersion parameter φ. Methods ------- @@ -56,7 +52,7 @@ class ExponentialDispersionModel(metaclass=ABCMeta): """ def in_y_range(self, y): - """Returns ``True`` if y is in the valid range of Y~EDM. + """Returns ``True`` if y is in the valid range of Y∼EDM. Parameters ---------- @@ -80,17 +76,13 @@ def in_y_range(self, y): @abstractmethod def unit_variance(self, mu): - r"""Compute the unit variance function. + """Compute the unit variance function. - The unit variance :math:`v(\mu)` determines the variance as - a function of the mean :math:`\mu` by - :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. - It can also be derived from the unit deviance :math:`d(y,\mu)` as + The unit variance v(μ) determines the variance as a function of the + mean μ by Var[Y_i] = φ/s_i * v(μ_i). + It can also be derived from the unit deviance d(y,μ) as:: - .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ - \partial\mu^2}}\big|_{y=\mu} - - See also :func:`variance`. + v(μ) = 2/(∂^2 d(y,μ)/(∂ μ^2))|_{y=μ} Parameters ---------- @@ -101,9 +93,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - r"""Compute the derivative of the unit variance w.r.t. mu. + """Compute the derivative of the unit variance w.r.t. mu. - Return :math:`v'(\mu)`. + Return v'(μ). Parameters ---------- @@ -114,12 +106,11 @@ def unit_variance_derivative(self, mu): @abstractmethod def unit_deviance(self, y, mu, check_input=False): - r"""Compute the unit deviance. + """Compute the unit deviance. 
+ + The unit_deviance d(y,μ) can be defined by the log-likelihood as:: - The unit_deviance :math:`d(y,\mu)` can be defined by the - log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + d(y,μ) = -2φ * (loglike(y,μ,φ) - loglike(y,y,φ)) Parameters ---------- @@ -140,11 +131,10 @@ def unit_deviance(self, y, mu, check_input=False): pass # pragma: no cover def unit_deviance_derivative(self, y, mu): - r"""Compute the derivative of the unit deviance w.r.t. mu. + """Compute the derivative of the unit deviance w.r.t. mu. The derivative of the unit deviance is given by - :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` - with unit variance :math:`v(\mu)`. + ∂ d(y,μ)/(∂ μ) = -2(y-μ)/v(μ) with unit variance v(μ). Parameters ---------- @@ -157,14 +147,13 @@ def unit_deviance_derivative(self, y, mu): return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""Compute the deviance. + """Compute the deviance. The deviance is a weighted sum of the per sample unit deviances, - :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` - with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{s}) - - loglike(y,y,\frac{phi}{s})\right)`. + D = sum_i s_i * d(y_i,μ_i) + with weights s_i and unit deviance d(y,μ). + In terms of the log-likelihood it is + D = -2φ * (loglike(y,μ,φ/s) - loglike(y,y,φ/s)). Parameters ---------- @@ -182,7 +171,7 @@ def deviance(self, y, mu, weights=1): def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. - It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + It gives ∂ D(y, μ; weights)/(∂ μ). Parameters ---------- @@ -211,11 +200,10 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): class TweedieDistribution(ExponentialDispersionModel): - r"""A class for the Tweedie distribution. + """A class for the Tweedie distribution. - A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely - defined by it's mean-variance relationship - :math:`\mathrm{Var}[Y] \propto \mu^power`. + A Tweedie distribution with mean μ=E[Y] is uniquely defined by it's + mean-variance relationship Var[Y] ∝ μ^power. Special cases are: @@ -231,8 +219,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- power : float (default=0) - The variance power of the `unit_variance` - :math:`v(\mu) = \mu^{power}`. + The variance power of the unit variance v(μ) = μ^power. For ``0=1.') + raise ValueError('Tweedie distribution is only defined for ' + 'power<=0 and p>=1.') elif 1 <= power < 2: # Poisson or Compound Poisson distribution self._lower_bound = DistributionBoundary(0, inclusive=True) @@ -279,7 +266,7 @@ def unit_variance(self, mu): def unit_variance_derivative(self, mu): """Compute the derivative of the unit variance of a Tweedie - distribution v(mu)=power*mu**(power-1). + distribution v(mu)=power * mu**(power-1). Parameters ---------- @@ -289,12 +276,10 @@ def unit_variance_derivative(self, mu): return self.power * np.power(mu, self.power - 1) def unit_deviance(self, y, mu, check_input=False): - r"""Compute the unit deviance. + """Compute the unit deviance. 
- The unit_deviance :math:`d(y,\mu)` can be defined by the - log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + The unit deviance d(y,μ) can be defined by the log-likelihood as + d(y,μ) = -2φ * (loglike(y,μ,φ) - loglike(y,y,φ)). Parameters ---------- @@ -328,7 +313,7 @@ def unit_deviance(self, y, mu, check_input=False): raise ValueError("Tweedie deviance is only defined for p<=0 " "and p>=1.") elif 1 <= p < 2: - # Poisson and Compount poisson distribution, y >= 0, mu > 0 + # Poisson and Compound poisson distribution, y >= 0, mu > 0 if (y < 0).any() or (mu <= 0).any(): raise ValueError(message + "non-negative y and strictly " "positive mu.") diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 13ca3673c8b3d..fb653b5e4361f 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,7 +31,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). Therefore, + fitting and predicting the mean of the target y as μ=h(X*w). Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: @@ -118,16 +118,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and + Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of + and given by the specific EDM, see :ref:`User Guide `. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For alpha > 0, the feature matrix X should be standardized in order to + For ``alpha > 0``, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. @@ -138,7 +138,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + Consider ȳ = (sum_i s_i y_i)(sum_i s_i), in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. @@ -183,9 +183,9 @@ def fit(self, X, y, sample_weight=None): optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has - Var[Y_i]=phi/w_i * v(mu). - If Y_i ~ EDM(mu, phi/w_i), then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + Var[Y_i]=φ/w_i * v(mu). + If Y_i ~ EDM(mu, φ/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, φ/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. Returns @@ -218,7 +218,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("No default link known for the " "specified distribution family. 
Please " "set link manually, i.e. not to 'auto'; " - "got (link='auto', family={}" + "got (link='auto', family={})" .format(self.family)) elif self.link == 'identity': self._link_instance = IdentityLink() @@ -383,11 +383,10 @@ def score(self, X, y, sample_weight=None): R^2 uses squared error and D^2 deviance. Note that those two are equal for ``family='normal'``. - D^2 is defined as - :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, - :math:`D_{null}` is the null deviance, i.e. the deviance of a model - with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. - The mean :math:`\\bar{y}` is averaged by sample_weight. + D^2 is defined as D^2 = 1 - D(y_true,y_pred) / D_null, + D_null is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to y_pred = ȳ. + The mean ȳ is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -490,17 +489,18 @@ class PoissonRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and + Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of + and given by the specific EDM, see :ref:`User Guide `. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For alpha > 0, the feature matrix X should be standardized in order to - penalize features equally strong. + For ``alpha > 0``, the feature matrix X should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. If the target y is a ratio, appropriate sample weights s should be provided. @@ -508,7 +508,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + Consider ȳ = (sum_i s_i y_i)(sum_i s_i), in this case one might say that y has a 'scaled' Poisson distributions. References @@ -608,12 +608,12 @@ class GammaRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and + Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of + and given by the specific EDM, see :ref:`User Guide `. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. 
@@ -664,7 +664,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: :math:`v(\\mu) = \\mu^{power}`. + The variance power: v(μ) = μ^{power}. For ``0`. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. From 44524cafc0aac0289267f1099b2ec2f8d53fe6c7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 20:28:54 +0200 Subject: [PATCH 134/209] Replace Tweedie p->power --- sklearn/linear_model/_glm/distribution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 1baf99a465326..e82f787d7e710 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -241,7 +241,7 @@ def power(self, power): self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False) elif 0 < power < 1: raise ValueError('Tweedie distribution is only defined for ' - 'power<=0 and p>=1.') + 'power<=0 and power>=1.') elif 1 <= power < 2: # Poisson or Compound Poisson distribution self._lower_bound = DistributionBoundary(0, inclusive=True) From 58d240973a175ee705d4e8cfbd4e9b6495eca609 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 20:34:11 +0200 Subject: [PATCH 135/209] Replace Tweedie p->power --- sklearn/linear_model/_glm/distribution.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e82f787d7e710..e0b3511734cee 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -300,7 +300,7 @@ def unit_deviance(self, y, mu, check_input=False): p = self.power if check_input: - message = ("Mean Tweedie deviance error with p={} can only be " + message = ("Mean Tweedie deviance error with power={} can only be " "used on ".format(p)) if p < 0: # 'Extreme stable', y any realy number, mu > 0 @@ -310,8 +310,8 @@ def unit_deviance(self, y, mu, check_input=False): # Normal, y and mu can be any real number pass elif 0 < p < 1: - raise ValueError("Tweedie deviance is only defined for p<=0 " - "and p>=1.") + raise ValueError("Tweedie deviance is only defined for " + "power<=0 and power>=1.") elif 1 <= p < 2: # Poisson and Compound poisson distribution, y >= 0, mu > 0 if (y < 0).any() or (mu <= 0).any(): @@ -335,8 +335,8 @@ def unit_deviance(self, y, mu, check_input=False): # Normal distribution, y and mu any real number dev = (y - mu)**2 elif p < 1: - raise ValueError("Tweedie deviance is only defined for p<=0 and " - "p>=1.") + raise ValueError("Tweedie deviance is only defined for power<=0 " + "and power>=1.") elif p == 1: # Poisson distribution dev = 2 * (xlogy(y, y/mu) - y + mu) From ee351e1f7137bc225d8f964558009612303b37db Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 00:11:57 +0200 Subject: [PATCH 136/209] Fix tests due to Tweedie p->power --- sklearn/linear_model/_glm/tests/test_distribution.py | 2 +- sklearn/metrics/tests/test_regression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index 82e493b7a2149..ed81c8328c87a 100644 --- 
a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -45,7 +45,7 @@ def test_invalid_distribution_bound(): def test_tweedie_distribution_power(): - msg = "distribution is only defined for p<=0 and p>=1" + msg = "distribution is only defined for power<=0 and power>=1" with pytest.raises(ValueError, match=msg): TweedieDistribution(power=0.5) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 8e18321f80303..8febf9064e58e 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -133,7 +133,7 @@ def test_regression_metrics_at_limits(): mean_tweedie_deviance([0.], [0.], p=p) with pytest.raises(ValueError, - match="is only defined for p<=0 and p>=1"): + match="is only defined for power<=0 and power>=1"): mean_tweedie_deviance([0.], [0.], p=0.5) From 33fe9be737001045eb9fad5616f678fb27fb8d79 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 12:09:24 +0200 Subject: [PATCH 137/209] Simplify super(...) --- sklearn/linear_model/_glm/distribution.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e0b3511734cee..7ce51deb0921d 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -353,25 +353,25 @@ def unit_deviance(self, y, mu, check_input=False): class NormalDistribution(TweedieDistribution): """Class for the Normal (aka Gaussian) distribution""" def __init__(self): - super(NormalDistribution, self).__init__(power=0) + super().__init__(power=0) class PoissonDistribution(TweedieDistribution): """Class for the scaled Poisson distribution""" def __init__(self): - super(PoissonDistribution, self).__init__(power=1) + super().__init__(power=1) class GammaDistribution(TweedieDistribution): """Class for the Gamma distribution""" def __init__(self): - super(GammaDistribution, self).__init__(power=2) + super().__init__(power=2) class InverseGaussianDistribution(TweedieDistribution): """Class for the scaled InverseGaussianDistribution distribution""" def __init__(self): - super(InverseGaussianDistribution, self).__init__(power=3) + super().__init__(power=3) EDM_DISTRIBUTIONS = { From 94272e79d565672416fbc6772aa8c0eb9e7ce519 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 12:26:12 +0200 Subject: [PATCH 138/209] Replace Link.link(..) by __call__(..) --- sklearn/linear_model/_glm/glm.py | 8 ++++---- sklearn/linear_model/_glm/link.py | 8 ++++---- sklearn/linear_model/_glm/tests/test_link.py | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index fb653b5e4361f..4d7a05095cb27 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -49,7 +49,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix X must have full column rank (no collinearities). 
@@ -296,7 +296,7 @@ def fit(self, X, y, sample_weight=None): else: if self.fit_intercept: coef = np.zeros(n_features+1) - coef[0] = link.link(np.average(y, weights=weights)) + coef[0] = link(np.average(y, weights=weights)) else: coef = np.zeros(n_features) @@ -441,7 +441,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix X must have full column rank (no collinearities). @@ -560,7 +560,7 @@ class GammaRegressor(GeneralizedLinearRegressor): Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix X must have full column rank (no collinearities). diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index ec9a7b7736eb1..1ef485cc85a55 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -15,7 +15,7 @@ class Link(metaclass=ABCMeta): """Abstract base class for Link functions.""" @abstractmethod - def link(self, mu): + def __call__(self, mu): """Compute the link function g(mu). The link function links the mean mu=E[Y] to the so called linear @@ -79,7 +79,7 @@ def inverse_derivative2(self, lin_pred): class IdentityLink(Link): """The identity link function g(x)=x.""" - def link(self, mu): + def __call__(self, mu): return mu def derivative(self, mu): @@ -98,7 +98,7 @@ def inverse_derivative2(self, lin_pred): class LogLink(Link): """The log link function g(x)=log(x).""" - def link(self, mu): + def __call__(self, mu): return np.log(mu) def derivative(self, mu): @@ -117,7 +117,7 @@ def inverse_derivative2(self, lin_pred): class LogitLink(Link): """The logit link function g(x)=logit(x).""" - def link(self, mu): + def __call__(self, mu): return logit(mu) def derivative(self, mu): diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index 3a2a21c4c04e0..feafb151e14a2 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -25,7 +25,7 @@ def test_link_properties(link): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 - assert_allclose(link.link(link.inverse(x)), x) + assert_allclose(link(link.inverse(x)), x) # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) assert_allclose(link.derivative(link.inverse(x)), 1./link.inverse_derivative(x)) @@ -34,5 +34,5 @@ def test_link_properties(link): link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) # for LogitLink, in the following x should be between 0 and 1. - # assert_almost_equal(link.inverse_derivative(link.link(x)), + # assert_almost_equal(link.inverse_derivative(link(x)), # 1./link.derivative(x), decimal=decimal) From 2457039f82ea308e634e4853484df17029f311ac Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 12:45:35 +0200 Subject: [PATCH 139/209] Replace 1. 
-> 1 --- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/link.py | 8 ++++---- sklearn/linear_model/_glm/tests/test_link.py | 11 ++++++----- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 4d7a05095cb27..b8d87799f8a93 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -415,7 +415,7 @@ def score(self, X, y, sample_weight=None): dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) - return 1. - dev / dev_null + return 1 - dev / dev_null def _more_tags(self): return {"requires_positive_y": True} diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index 1ef485cc85a55..04b485c067cca 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -102,7 +102,7 @@ def __call__(self, mu): return np.log(mu) def derivative(self, mu): - return 1./mu + return 1 / mu def inverse(self, lin_pred): return np.exp(lin_pred) @@ -121,15 +121,15 @@ def __call__(self, mu): return logit(mu) def derivative(self, mu): - return 1. / (mu * (1 - mu)) + return 1 / (mu * (1 - mu)) def inverse(self, lin_pred): return expit(lin_pred) def inverse_derivative(self, lin_pred): ep = expit(lin_pred) - return ep * (1. - ep) + return ep * (1 - ep) def inverse_derivative2(self, lin_pred): ep = expit(lin_pred) - return ep * (1. - ep) * (1. - 2 * ep) + return ep * (1 - ep) * (1 - 2 * ep) diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index feafb151e14a2..a631509baca79 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -19,20 +19,21 @@ def test_link_properties(link): """Test link inverse and derivative.""" rng = np.random.RandomState(42) - x = rng.rand(100)*100 + x = rng.rand(100) * 100 link = link() # instantiate object if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 assert_allclose(link(link.inverse(x)), x) - # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + # if g(h(x)) = x, then g'(h(x)) = 1/h'(x) + # g = link, h = link.inverse assert_allclose(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x)) + 1 / link.inverse_derivative(x)) assert ( link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) - # for LogitLink, in the following x should be between 0 and 1. + # for LogitLink, in the following x should be between 0 and 1 # assert_almost_equal(link.inverse_derivative(link(x)), - # 1./link.derivative(x), decimal=decimal) + # 1 / link.derivative(x), decimal=decimal) From 6396d2c4495681d7d774f7c7de7054b336ba2709 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 13:46:45 +0200 Subject: [PATCH 140/209] Fix table in TweedieRegressor --- sklearn/linear_model/_glm/glm.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index b8d87799f8a93..f538aa6121747 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -664,19 +664,24 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: v(μ) = μ^{power}. + The variance power: v(μ) = μ^power. 
For ``0 Date: Thu, 22 Aug 2019 12:02:28 +0300 Subject: [PATCH 141/209] Improve docstring in plot_tweedie_regression_insurance_claims.py --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 55a21c8d8723d..24b3afcadf120 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -85,8 +85,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, Parameters ---------- - df : DataFrame with at least three columns named feature, weight and - observed + df : DataFrame + input data feature: str a column name of df for the feature to be plotted weight : str From da66fd5dc0fb8fc89825e5cf75057eed70c09aed Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 12:23:40 +0300 Subject: [PATCH 142/209] Use train_test_split in tests --- sklearn/linear_model/_glm/tests/test_glm.py | 36 +++++++++++---------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index ebcab6395e5b4..d6c7c7c3f767e 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -24,6 +24,7 @@ ) from sklearn.linear_model import Ridge from sklearn.exceptions import ConvergenceWarning +from sklearn.model_selection import train_test_split GLM_SOLVERS = ['lbfgs'] @@ -207,9 +208,8 @@ def test_glm_log_regression(family, solver, tol): @pytest.mark.parametrize('fit_intercept', [True, False]) def test_warm_start(fit_intercept): - n_samples, n_features = 100, 10 - n_predict = 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_samples, n_features = 110, 10 + X, y, coef = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_features-2, noise=0.5, coef=True, random_state=42) @@ -230,7 +230,7 @@ def test_warm_start(fit_intercept): assert glm1.score(X, y) > glm2.score(X, y) glm2.set_params(max_iter=1000) glm2.fit(X, y) - assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-4, atol=1e-5) + assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) # TODO: investigate why this doesn't match # assert glm1.n_iter_ == glm2.n_iter_ + 2 @@ -242,13 +242,14 @@ def test_warm_start(fit_intercept): def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): """Compare with Ridge regression for Normal distributions.""" alpha = 1.0 - n_predict = 10 - X, y, _ = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=42) - y = y[0:n_samples] - X, T = X[0:n_samples], X[n_samples:] + test_size = 10 + X, y = make_regression(n_samples=n_samples + test_size, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + random_state=42) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=0 + ) if n_samples > n_features: ridge_params = {"solver": "svd"} @@ -258,17 +259,18 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, normalize=False, random_state=42, **ridge_params) - ridge.fit(X, y) + ridge.fit(X_train, y_train) 
glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, - max_iter=300, solver=solver, tol=1e-6, - check_input=False) - glm.fit(X, y) + solver=solver, check_input=False, + max_iter=300) + glm.fit(X_train, y_train) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) + assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) + assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=5e-5) + assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=5e-5) @pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-7)]) From b9bc170b2ef7673d6b40103a6583cd9a56d7c517 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 14:07:32 +0300 Subject: [PATCH 143/209] Fix TODO in test_warm_start --- sklearn/linear_model/_glm/tests/test_glm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index d6c7c7c3f767e..4975d1454b922 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -232,8 +232,7 @@ def test_warm_start(fit_intercept): glm2.fit(X, y) assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) - # TODO: investigate why this doesn't match - # assert glm1.n_iter_ == glm2.n_iter_ + 2 + assert glm1.n_iter_ == glm2.n_iter_ @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) From ab6c5d82ea6149f8acc20834abcc735c630a516c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 14:31:06 +0300 Subject: [PATCH 144/209] Revert "No LaTeX in docstrings" This reverts commit 61dc13fdd5b71fe0e1d30e5a9b58934441a9dcb1. --- sklearn/linear_model/_glm/distribution.py | 85 +++++++++++++---------- sklearn/linear_model/_glm/glm.py | 66 +++++++++--------- 2 files changed, 83 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 7ce51deb0921d..5754cb391ec61 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -26,15 +26,19 @@ def _safe_lin_pred(X, coef): class ExponentialDispersionModel(metaclass=ABCMeta): - """Base class for reproductive Exponential Dispersion Models (EDM). + r"""Base class for reproductive Exponential Dispersion Models (EDM). - The pdf of Y∼EDM(μ, φ) is given by:: + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by - p(y| θ, φ) = c1(y, φ) * exp((θy-A(θ))/φ) - = c2(y, φ) * exp(-d(y, μ)/(2φ)) + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) - with mean E[Y] = A'(θ) = μ, variance Var[Y] = φ * v(μ), - unit variance v(μ), unit deviance d(y,μ) and dispersion parameter φ. + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. Methods ------- @@ -52,7 +56,7 @@ class ExponentialDispersionModel(metaclass=ABCMeta): """ def in_y_range(self, y): - """Returns ``True`` if y is in the valid range of Y∼EDM. + """Returns ``True`` if y is in the valid range of Y~EDM. 
Parameters ---------- @@ -76,13 +80,17 @@ def in_y_range(self, y): @abstractmethod def unit_variance(self, mu): - """Compute the unit variance function. + r"""Compute the unit variance function. - The unit variance v(μ) determines the variance as a function of the - mean μ by Var[Y_i] = φ/s_i * v(μ_i). - It can also be derived from the unit deviance d(y,μ) as:: + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as - v(μ) = 2/(∂^2 d(y,μ)/(∂ μ^2))|_{y=μ} + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. Parameters ---------- @@ -93,9 +101,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - """Compute the derivative of the unit variance w.r.t. mu. + r"""Compute the derivative of the unit variance w.r.t. mu. - Return v'(μ). + Return :math:`v'(\mu)`. Parameters ---------- @@ -106,11 +114,12 @@ def unit_variance_derivative(self, mu): @abstractmethod def unit_deviance(self, y, mu, check_input=False): - """Compute the unit deviance. - - The unit_deviance d(y,μ) can be defined by the log-likelihood as:: + r"""Compute the unit deviance. - d(y,μ) = -2φ * (loglike(y,μ,φ) - loglike(y,y,φ)) + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` Parameters ---------- @@ -131,10 +140,11 @@ def unit_deviance(self, y, mu, check_input=False): pass # pragma: no cover def unit_deviance_derivative(self, y, mu): - """Compute the derivative of the unit deviance w.r.t. mu. + r"""Compute the derivative of the unit deviance w.r.t. mu. The derivative of the unit deviance is given by - ∂ d(y,μ)/(∂ μ) = -2(y-μ)/v(μ) with unit variance v(μ). + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. Parameters ---------- @@ -147,13 +157,14 @@ def unit_deviance_derivative(self, y, mu): return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): - """Compute the deviance. + r"""Compute the deviance. The deviance is a weighted sum of the per sample unit deviances, - D = sum_i s_i * d(y_i,μ_i) - with weights s_i and unit deviance d(y,μ). - In terms of the log-likelihood it is - D = -2φ * (loglike(y,μ,φ/s) - loglike(y,y,φ/s)). + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. Parameters ---------- @@ -171,7 +182,7 @@ def deviance(self, y, mu, weights=1): def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. - It gives ∂ D(y, μ; weights)/(∂ μ). + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. Parameters ---------- @@ -200,10 +211,11 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): class TweedieDistribution(ExponentialDispersionModel): - """A class for the Tweedie distribution. + r"""A class for the Tweedie distribution. - A Tweedie distribution with mean μ=E[Y] is uniquely defined by it's - mean-variance relationship Var[Y] ∝ μ^power. 
+ A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -219,7 +231,8 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- power : float (default=0) - The variance power of the unit variance v(μ) = μ^power. + The variance power of the `unit_variance` + :math:`v(\mu) = \mu^{power}`. For ``0=1.") elif 1 <= p < 2: - # Poisson and Compound poisson distribution, y >= 0, mu > 0 + # Poisson and Compount poisson distribution, y >= 0, mu > 0 if (y < 0).any() or (mu <= 0).any(): raise ValueError(message + "non-negative y and strictly " "positive mu.") diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f538aa6121747..69e68c1eda0f8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,7 +31,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as μ=h(X*w). Therefore, + fitting and predicting the mean of the target y as mu=h(X*w). Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: @@ -118,16 +118,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and - Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of - and given by the specific EDM, see + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see :ref:`User Guide `. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For ``alpha > 0``, the feature matrix X should be standardized in order to + For alpha > 0, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. @@ -138,7 +138,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider ȳ = (sum_i s_i y_i)(sum_i s_i), + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. @@ -183,9 +183,9 @@ def fit(self, X, y, sample_weight=None): optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has - Var[Y_i]=φ/w_i * v(mu). - If Y_i ~ EDM(mu, φ/w_i), then - sum(w*Y)/sum(w) ~ EDM(mu, φ/sum(w)), i.e. the mean of y is a + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. Returns @@ -218,7 +218,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("No default link known for the " "specified distribution family. Please " "set link manually, i.e. 
not to 'auto'; " - "got (link='auto', family={})" + "got (link='auto', family={}" .format(self.family)) elif self.link == 'identity': self._link_instance = IdentityLink() @@ -383,10 +383,11 @@ def score(self, X, y, sample_weight=None): R^2 uses squared error and D^2 deviance. Note that those two are equal for ``family='normal'``. - D^2 is defined as D^2 = 1 - D(y_true,y_pred) / D_null, - D_null is the null deviance, i.e. the deviance of a model - with intercept alone, which corresponds to y_pred = ȳ. - The mean ȳ is averaged by sample_weight. + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -489,18 +490,17 @@ class PoissonRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and - Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of - and given by the specific EDM, see + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see :ref:`User Guide `. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For ``alpha > 0``, the feature matrix X should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. If the target y is a ratio, appropriate sample weights s should be provided. @@ -508,7 +508,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider ȳ = (sum_i s_i y_i)(sum_i s_i), + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distributions. References @@ -608,12 +608,12 @@ class GammaRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and - Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of - and given by the specific EDM, see + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see :ref:`User Guide `. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. @@ -664,7 +664,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: v(μ) = μ^power. 
+ The variance power: :math:`v(\\mu) = \\mu^{power}`. For ``0`. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. From b424a070f696262174959c1927c921d301fd8ba5 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 14:42:39 +0300 Subject: [PATCH 145/209] Remove n_iter_ check when warm start. --- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/tests/test_glm.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 69e68c1eda0f8..38f27eec397e8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -218,7 +218,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("No default link known for the " "specified distribution family. Please " "set link manually, i.e. not to 'auto'; " - "got (link='auto', family={}" + "got (link='auto', family={})" .format(self.family)) elif self.link == 'identity': self._link_instance = IdentityLink() diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 4975d1454b922..879d382ce28dc 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -232,7 +232,6 @@ def test_warm_start(fit_intercept): glm2.fit(X, y) assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) - assert glm1.n_iter_ == glm2.n_iter_ @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) From 95a90580c9d034e7bcb9c488cc19d09ac692deca Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 15:02:00 +0300 Subject: [PATCH 146/209] Rename variable L2 -> coef_scaled --- sklearn/linear_model/_glm/glm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 38f27eec397e8..a7128eb78df07 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -304,15 +304,16 @@ def fit(self, X, y, sample_weight=None): if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): - mu, devp = \ - family._mu_deviance_derivative(coef, X, y, weights, link) + mu, devp = family._mu_deviance_derivative( + coef, X, y, weights, link + ) dev = family.deviance(y, mu, weights) intercept = (coef.size == X.shape[1] + 1) idx = 1 if intercept else 0 # offset if coef[0] is intercept - L2 = alpha * coef[idx:] - obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) + coef_scaled = alpha * coef[idx:] + obj = 0.5 * dev + 0.5 * (coef[idx:] @ coef_scaled) objp = 0.5 * devp - objp[idx:] += L2 + objp[idx:] += coef_scaled return obj, objp args = (X, y, weights, self.alpha, family, link) From 59eceb42d874d94d0bb7fdced4f3e05fd030dc62 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 15:59:54 +0300 Subject: [PATCH 147/209] Minor fixes --- .../plot_tweedie_regression_insurance_claims.py | 3 --- sklearn/linear_model/_glm/distribution.py | 12 ++++++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 24b3afcadf120..24451069eeb68 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ 
b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -181,9 +181,6 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # events occuring with a constant rate in a given time interval (``Exposure``). # Here we model the frequency ``y = ClaimNb / Exposure``, # which is still a (scaled) Poisson distribution. -# -# A very important property of the Poisson distribution is its mean-variance -# relation: The variance is proportional to the mean. df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 5754cb391ec61..8779cd616cc5a 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -342,9 +342,9 @@ def unit_deviance(self, y, mu, check_input=False): if p < 0: # 'Extreme stable', y any realy number, mu > 0 - dev = 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p) * (2-p)) - - y * np.power(mu, 1-p)/(1-p) + - np.power(mu, 2-p)/(2-p)) + dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p)) + - y * np.power(mu, 1-p) / (1-p) + + np.power(mu, 2-p) / (2-p)) elif p == 0: # Normal distribution, y and mu any real number @@ -359,9 +359,9 @@ def unit_deviance(self, y, mu, check_input=False): # Gamma distribution dev = 2 * (np.log(mu/y) + y/mu - 1) else: - dev = 2 * (np.power(y, 2-p)/((1-p) * (2-p)) - - y * np.power(mu, 1-p)/(1-p) + - np.power(mu, 2-p)/(2-p)) + dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p)) + - y * np.power(mu, 1-p) / (1-p) + + np.power(mu, 2-p) / (2-p)) return dev From 04f30f40e572d6e919fef8a4c2d4bcb27d562b6b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 10:13:27 +0300 Subject: [PATCH 148/209] Better wording in example --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 5c0b64faea255..d7f9b65660453 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -5,7 +5,9 @@ This example illustrates the use of linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compares -it with models learned with least squared error. +it with models learned with least squared error. The goal is to predict the +number of insurance claims (or frequency) following car accidents for a user +given historical data over a population of users. We start by defining a few helper functions for loading the data and visualizing results. 
From 3630b5277f68cd9805ec5f3af21b7fe9336cd46d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 11:25:19 +0300 Subject: [PATCH 149/209] Improvements in plot_poisson_regression_non_normal_loss.py --- ...plot_poisson_regression_non_normal_loss.py | 95 ++++++++++--------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d7f9b65660453..6a98030817245 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -28,6 +28,7 @@ import pandas as pd from sklearn.datasets import fetch_openml +from sklearn.dummy import DummyRegressor from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, LinearRegression from sklearn.model_selection import train_test_split @@ -78,7 +79,7 @@ def load_mtpl2(n_samples=100000): # containing the number of claims (``ClaimNb``) with the freMTPL2sev table # containing the claim amount (``ClaimAmount``) for the same user ids. -df = load_mtpl2(n_samples=100000) +df = load_mtpl2(n_samples=50000) # Note: filter out claims with zero amount, as the severity model # requires a strictly positive target values. @@ -117,8 +118,6 @@ def load_mtpl2(n_samples=100000): # (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``, # which is still a (scaled) Poisson distribution. # -# A very important property of the Poisson distribution is its mean-variance -# relation: The variance is proportional to the mean. df["Frequency"] = df.ClaimNb / df.Exposure @@ -135,49 +134,50 @@ def load_mtpl2(n_samples=100000): # To evaluate the pertinence of the used metrics, we will consider as a # baseline an estimator that returns 0 for any input. 
-df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) +dummy = DummyRegressor(strategy='constant', constant=0) +dummy.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) -eps = 1e-5 -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, np.zeros(len(df_test)), - df_test.Exposure.values)) -print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, np.zeros(len(df_test)), - df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, eps + np.zeros(len(df_test)), - df_test.Exposure.values)) +############################################################################## +# +# The Poisson deviance cannot be computed on negative values predicted by the +# model, so we set the minimum predicted value to eps, + + +def score_estimator(estimator, df_test, eps=1e-5): + """Score an estimatr on the test set""" + + print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, estimator.predict(X_test), + df_test.Exposure.values)) + print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, estimator.predict(X_test), + df_test.Exposure.values)) + print("mean Poisson deviance: %.3f" % mean_poisson_deviance( + df_test.Frequency.values, np.fmax(estimator.predict(X_test), eps), + df_test.Exposure.values)) + + +print("DummyRegressor") +score_estimator(dummy, df_test) ############################################################################## # # We start by modeling the target variable with the least squares linear # regression model, - linregr = LinearRegression() linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) -print("LinearRegression") -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, linregr.predict(X_test), - df_test.Exposure.values)) -print("MSE: %.3f" % mean_absolute_error( - df_test.Frequency.values, linregr.predict(X_test), - df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps), - df_test.Exposure.values)) - -############################################################################## -# -# The Poisson deviance cannot be computed because negative values are -# predicted by the model, print('Number Negatives: %s / total: %s' % ( (linregr.predict(X_test) < 0).sum(), X_test.shape[0])) +print("LinearRegression") +score_estimator(linregr, df_test) + ############################################################################## # # Next we fit the Poisson regressor on the target variable, @@ -186,15 +186,7 @@ def load_mtpl2(n_samples=100000): glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) print("PoissonRegressor") -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, glm_freq.predict(X_test), - df_test.Exposure.values)) -print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, glm_freq.predict(X_test), - df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, glm_freq.predict(X_test), - df_test.Exposure.values)) +score_estimator(glm_freq, df_test) ############################################################################## # @@ -202,19 +194,13 @@ def load_mtpl2(n_samples=100000): # still minimizes the least square error. 
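As background for the baseline comparison in this example, here is a self-contained sketch of the same kind of scoring on synthetic data (not the MTPL dataset). It assumes the PoissonRegressor estimator and the mean_poisson_deviance metric introduced by this pull request, and the eps clamping mirrors the one used in score_estimator above.

import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_poisson_deviance

rng = np.random.RandomState(0)
X = rng.rand(1000, 2)
y = rng.poisson(lam=np.exp(1 + 2 * X[:, 0]))  # Poisson-distributed counts

# Constant-zero baseline, as in the example, and a Poisson GLM with log link.
dummy = DummyRegressor(strategy='constant', constant=0).fit(X, y)
glm = PoissonRegressor(alpha=1e-4).fit(X, y)

# The Poisson deviance needs strictly positive predictions, hence the clamping.
eps = 1e-5
for name, est in [('constant zero baseline', dummy), ('Poisson GLM', glm)]:
    y_pred = np.fmax(est.predict(X), eps)
    print(name, mean_poisson_deviance(y, y_pred))

A much lower mean deviance for the GLM than for the trivial baseline is the kind of gap the score tables in this example are meant to surface.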
-gbr = GradientBoostingRegressor(max_depth=3) +gbr = GradientBoostingRegressor() gbr.fit(X_train, df_train.Frequency.values, sample_weight=df_train.Exposure.values) print("GradientBoostingRegressor") -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) -print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(gbr.predict(X_test), eps), - df_test.Exposure.values)) +score_estimator(gbr, df_test) ############################################################################## # @@ -231,6 +217,7 @@ def load_mtpl2(n_samples=100000): fig, axes = plt.subplots(1, 4, figsize=(16, 3)) +fig.subplots_adjust(bottom=0.2) df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=axes[0]) @@ -247,3 +234,17 @@ def load_mtpl2(n_samples=100000): yscale='log', xlabel="y (Frequency)" ) + +############################################################################## +# +# The experimental data presents a long tail distribution for ``y``. In all +# models we predict the mean expected value, so we will have necessairily fewer +# extreme values. Additionally normal distribution used in ``Ridge`` and +# ``GradientBoostingRegressor`` has a constant variance, while for the Poisson +# distribution used in ``PoissonRegressor``, the variance is proportional to +# the mean predicted value. +# +# Thus, among the considered estimators, +# ``PoissonRegressor`` and ``GradientBoostingRegressor`` are better suited for +# modeling the long tail distribution of the data as compared to the ``Ridge`` +# estimator. From 516eadba0cbddd9c016f66fc6dce41773f0350e7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 12:28:06 +0300 Subject: [PATCH 150/209] Improvements in plot_tweedie_regression_insurance_claims.py --- ...plot_poisson_regression_non_normal_loss.py | 4 +-- ...lot_tweedie_regression_insurance_claims.py | 30 +++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 6a98030817245..9deca6c25032a 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -116,8 +116,8 @@ def load_mtpl2(n_samples=100000): # as a Poisson distribution. It is then assumed to be the number of discrete # events occurring with a constant rate in a given time interval # (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution. -# +# which is still a (scaled) Poisson distribution, and use ``Exposure`` as +# `sample_weight`. df["Frequency"] = df.ClaimNb / df.Exposure diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 24451069eeb68..f866518b69db8 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -80,7 +80,7 @@ def load_mtpl2(n_samples=100000): def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, - title=None, ax=None): + title=None, ax=None, fill_legend=False): """Plot observed and predicted - aggregated per feature level. 
Parameters @@ -95,6 +95,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, a column name of df with the observed target predicted : frame a dataframe, with the same index as df, with the predicted target + fill_legend : bool, default=False + wgether to show fill_between legend """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() @@ -109,13 +111,15 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax) y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8 - ax.fill_between( + p2 = ax.fill_between( df_.index, 0, y_max * df_[weight] / df_[weight].values.max(), color="g", alpha=0.1, ) + if fill_legend: + ax.legend([p2], ["{} distribution".format(feature)]) ax.set( ylabel=y_label if y_label is not None else None, title=title if title is not None else "Train: Observed vs Predicted", @@ -132,7 +136,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # containing the claim amount (``ClaimAmount``) for the same policy ids # (``IDpol``). -df = load_mtpl2(n_samples=100000) +df = load_mtpl2(n_samples=60000) # Note: filter out claims with zero amount, as the severity model # requires a strictly positive target values. @@ -180,9 +184,10 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # as a Poisson distribution. It is then assumed to be the number of discrete # events occuring with a constant rate in a given time interval (``Exposure``). # Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution. +# which is still a (scaled) Poisson distribution, and use ``Exposure`` as +# `sample_weight`. -df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) # Some of the features are colinear, we use a weak penalization to avoid # numerical issues. @@ -200,6 +205,7 @@ def mean_deviance(estimator, y, y_pred, weights): def score_estimator( estimator, X_train, X_test, df_train, df_test, target, weights ): + """Evaluate an estimator on train and test sets with different metrics""" res = [] for subset_label, X, df in [ @@ -282,6 +288,7 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[0, 1], + fill_legend=True ) plot_obs_pred( @@ -293,6 +300,7 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[1, 0], + fill_legend=True ) plot_obs_pred( @@ -304,11 +312,17 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[1, 1], + fill_legend=True ) ############################################################################## # +# According to the observed data, the frequency of accidents is higher for +# drivers younger than 30 years old, and it positively correlated with the +# `BonusMalus` variable. Out model is able to mostly correctly model +# this behaviour. +# # 3. Severity model - Gamma Distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically @@ -392,11 +406,15 @@ def score_estimator( y_label="Average Claim Severity", title="test data", ax=ax[1], + fill_legend=True ) ############################################################################## # +# Overall the drivers age (``DrivAge``) has a weak impact on the claim +# severity, both in observed and predicted data. +# # 4. 
Total Claims Amount -- Compound Poisson distribution # ------------------------------------------------------- # @@ -517,3 +535,5 @@ def score(self, X, y, sample_weight=None): ) print(pd.DataFrame(res).set_index("subset").T) + +plt.show() From 5e14928507f04893d131942c57e566c5a5789517 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 12:35:28 +0300 Subject: [PATCH 151/209] Drop unused ExponentialDispersionModel._upper_bound --- sklearn/linear_model/_glm/distribution.py | 7 +------ sklearn/linear_model/_glm/tests/test_distribution.py | 5 ----- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 8779cd616cc5a..e3dfd1fcd3cf5 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -63,11 +63,7 @@ def in_y_range(self, y): y : array, shape (n_samples,) Target values. """ - if hasattr(self, '_upper_bound'): - # All currently supported distributions have an upper bound at - # +inf, however this may need to be implemented for other - # distributions - raise NotImplementedError + # Note that currently supported distributions have +inf upper bound if not isinstance(self._lower_bound, DistributionBoundary): raise TypeError('_lower_bound attribute must be of type ' @@ -236,7 +232,6 @@ class TweedieDistribution(ExponentialDispersionModel): For ``0 Date: Wed, 28 Aug 2019 13:10:55 +0300 Subject: [PATCH 152/209] Move notes and references from docstrings to user manual --- doc/modules/linear_model.rst | 12 ++++ sklearn/linear_model/_glm/glm.py | 117 ------------------------------- 2 files changed, 12 insertions(+), 117 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index fc92cd2002948..b127a2c345a36 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -986,6 +986,18 @@ of the unit variance function, :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together with :math:`s=\mathrm{exposure}` as sample weights. This is done in both examples linked below. + * The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + * If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poisson distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. + The weights are necessary for the right (finite sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. The estimator can be used as follows:: diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index a7128eb78df07..826f5924776b3 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -114,43 +114,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. 
- - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - For alpha > 0, the feature matrix X should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - - If the target y is a ratio, appropriate sample weights s should be - provided. - As an example, consider Poisson distributed counts z (integers) and - weights s=exposure (time, money, persons years, ...). Then you fit - y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, - sample_weight=s)``. The weights are necessary for the right (finite - sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that y has a 'scaled' Poisson distributions. - The same holds for other distributions. - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', @@ -487,40 +450,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - For alpha > 0, the feature matrix X should be standardized in order to - penalize features equally strong. - - If the target y is a ratio, appropriate sample weights s should be - provided. - As an example, consider Poisson distributed counts z (integers) and - weights s=exposure (time, money, persons years, ...). Then you fit - y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. - The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that y has a 'scaled' Poisson distributions. - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, @@ -605,29 +534,6 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. 
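# The standardization advice in the notes above ("for alpha > 0, call
# StandardScaler before fit") amounts to wrapping the estimator in a pipeline.
# A minimal sketch on synthetic data; only StandardScaler, make_pipeline and
# PoissonRegressor are assumed, nothing specific to this patch.
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(42)
X = rng.normal(scale=[1.0, 100.0], size=(50, 2))  # features on very different scales
y = rng.poisson(np.exp(0.5 * X[:, 0]))

# With standardization the L2 penalty (alpha > 0) weighs both features equally.
model = make_pipeline(StandardScaler(), PoissonRegressor(alpha=0.1))
model.fit(X, y)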
- - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, @@ -732,29 +638,6 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, From 752d6aa90bc034426b0348a4fd12f0fc3421027d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 14:01:04 +0300 Subject: [PATCH 153/209] More explanatory comments in the code --- sklearn/linear_model/_glm/glm.py | 22 +++++++++++++++++---- sklearn/linear_model/_glm/tests/test_glm.py | 3 +++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 826f5924776b3..54560dbae0867 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -462,6 +462,10 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', @property def family(self): + # We use a property with a setter, since the GLM solver relies + # on self.family attribute, but we can't set it in __init__ according + # to scikit-learn API constraints. This attribute is made read-only + # to disallow changing distribution to other than Poisson. return "poisson" @family.setter @@ -546,6 +550,10 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', @property def family(self): + # We use a property with a setter, since the GLM solver relies + # on self.family attribute, but we can't set it in __init__ according + # to scikit-learn API constraints. This attribute is made read-only + # to disallow changing distribution to other than Gamma. return "gamma" @family.setter @@ -571,7 +579,10 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: :math:`v(\\mu) = \\mu^{power}`. + The power determines the underlying target distribution. By + definition it links distribution variance (:math:`v`) and + mean (:math:`\\mu`): :math:`v(\\mu) = \\mu^{power}`. 
+ For ``0 glm2.score(X, y) glm2.set_params(max_iter=1000) glm2.fit(X, y) + # The two model are not exactly identical since the lbfgs solver + # computes the approximate hessian from previous iterations, which + # will not be strictly identical in the case of a warm start. assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) From 38a4ad4e44ed4c4b27c48306afc74b85710dd889 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 14:18:43 +0300 Subject: [PATCH 154/209] Fix requires_positive_y tag --- sklearn/linear_model/_glm/glm.py | 11 ++++++++++- sklearn/linear_model/_glm/tests/test_glm.py | 13 +++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 54560dbae0867..7c518bb3f8149 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -383,7 +383,16 @@ def score(self, X, y, sample_weight=None): return 1 - dev / dev_null def _more_tags(self): - return {"requires_positive_y": True} + # create the _family_instance if fit wasn't called yet. + if hasattr(self, '_family_instance'): + _family_instance = self._family_instance + elif isinstance(self.family, ExponentialDispersionModel): + _family_instance = self.family + elif self.family in EDM_DISTRIBUTIONS: + _family_instance = EDM_DISTRIBUTIONS[self.family]() + else: + raise ValueError + return {"requires_positive_y": not _family_instance.in_y_range(-1.0)} class PoissonRegressor(GeneralizedLinearRegressor): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 93d7ee1f08324..898d3c4edf9c0 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -340,3 +340,16 @@ def test_tweedie_regression_family(regression_data): # TODO: the following should not be allowed # est.family.power = 2 + + +@pytest.mark.parametrize( + 'estimator, value', + [ + (PoissonRegressor(), True), + (GammaRegressor(), True), + (TweedieRegressor(power=1.5), True), + (TweedieRegressor(power=0), False) + ], +) +def test_tags(estimator, value): + assert estimator._get_tags()['requires_positive_y'] is value From c15a1cc573a555cc1a2e35c0063f6795cc2f7d84 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 14:56:06 +0300 Subject: [PATCH 155/209] Remove Link.inverse_derivative2 --- doc/modules/linear_model.rst | 4 ++-- sklearn/linear_model/_glm/link.py | 21 -------------------- sklearn/linear_model/_glm/tests/test_link.py | 7 ------- 3 files changed, 2 insertions(+), 30 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index b127a2c345a36..622ab335059ab 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -919,8 +919,8 @@ The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as .. math:: d(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - - loglike(y,y,\phi)\right) + \left( log P(y|\mu,\phi) + - log P(y|y,\phi)\right) The following table lists some specific EDM distributions—all are Tweedie distributions—and some of their properties. 
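This definition can be checked directly for the normal and Poisson cases; the
short sketch below is an added illustration (it relies on ``scipy.stats``
log-densities, nothing from this patch) and recovers the corresponding unit
deviances::

    import numpy as np
    from scipy.stats import norm, poisson

    y, mu, phi = 2.0, 0.5, 1.3

    # Normal EDM: unit deviance (y - mu)**2, dispersion phi = variance.
    d_normal = -2 * phi * (norm.logpdf(y, loc=mu, scale=np.sqrt(phi))
                           - norm.logpdf(y, loc=y, scale=np.sqrt(phi)))
    assert np.isclose(d_normal, (y - mu) ** 2)

    # Poisson EDM (phi = 1): unit deviance 2 * (y * log(y / mu) - y + mu).
    k = 3
    d_poisson = -2 * (poisson.logpmf(k, mu) - poisson.logpmf(k, k))
    assert np.isclose(d_poisson, 2 * (k * np.log(k / mu) - k + mu))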
diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index 04b485c067cca..b257036ac4727 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -64,17 +64,6 @@ def inverse_derivative(self, lin_pred): """ pass # pragma: no cover - @abstractmethod - def inverse_derivative2(self, lin_pred): - """Compute 2nd derivative of the inverse link function h''(lin_pred). - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass # pragma: no cover - class IdentityLink(Link): """The identity link function g(x)=x.""" @@ -91,9 +80,6 @@ def inverse(self, lin_pred): def inverse_derivative(self, lin_pred): return np.ones_like(lin_pred) - def inverse_derivative2(self, lin_pred): - return np.zeros_like(lin_pred) - class LogLink(Link): """The log link function g(x)=log(x).""" @@ -110,9 +96,6 @@ def inverse(self, lin_pred): def inverse_derivative(self, lin_pred): return np.exp(lin_pred) - def inverse_derivative2(self, lin_pred): - return np.exp(lin_pred) - class LogitLink(Link): """The logit link function g(x)=logit(x).""" @@ -129,7 +112,3 @@ def inverse(self, lin_pred): def inverse_derivative(self, lin_pred): ep = expit(lin_pred) return ep * (1 - ep) - - def inverse_derivative2(self, lin_pred): - ep = expit(lin_pred) - return ep * (1 - ep) * (1 - 2 * ep) diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index a631509baca79..36219e09b58e3 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -30,10 +30,3 @@ def test_link_properties(link): # g = link, h = link.inverse assert_allclose(link.derivative(link.inverse(x)), 1 / link.inverse_derivative(x)) - - assert ( - link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) - - # for LogitLink, in the following x should be between 0 and 1 - # assert_almost_equal(link.inverse_derivative(link(x)), - # 1 / link.derivative(x), decimal=decimal) From 37de07b7b78fe7ced78da6ed068fde3e4a08425b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 30 Aug 2019 12:41:44 +0300 Subject: [PATCH 156/209] Rename p to power parameter in mean_tweedie_deviance --- doc/modules/model_evaluation.rst | 45 ++++++++--------- sklearn/metrics/regression.py | 25 +++++----- sklearn/metrics/tests/test_regression.py | 61 ++++++++++++------------ 3 files changed, 67 insertions(+), 64 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 2db5053e08cce..1fe8ece51df67 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2028,14 +2028,14 @@ Mean Poisson, Gamma, and Tweedie deviances The :func:`mean_tweedie_deviance` function computes the `mean Tweedie deviance error `_ -with power parameter `p`. This is a metric that elicits predicted expectation +with a ``power`` parameter. This is a metric that elicits predicted expectation values of regression targets. Following special cases exist, -- when `p=0` it is equivalent to :func:`mean_squared_error`. -- when `p=1` it is equivalent to :func:`mean_poisson_deviance`. -- when `p=2` it is equivalent to :func:`mean_gamma_deviance`. +- when ``power=0`` it is equivalent to :func:`mean_squared_error`. +- when ``power=1`` it is equivalent to :func:`mean_poisson_deviance`. +- when ``power=2`` it is equivalent to :func:`mean_gamma_deviance`. 
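These special cases can be verified numerically; a small added check on
arbitrary strictly positive values, using only the four metrics named above::

  >>> import numpy as np
  >>> from sklearn.metrics import (mean_tweedie_deviance, mean_squared_error,
  ...                              mean_poisson_deviance, mean_gamma_deviance)
  >>> y_true, y_pred = [3.0, 0.5, 2.0], [2.0, 1.0, 2.5]
  >>> np.isclose(mean_tweedie_deviance(y_true, y_pred, power=0),
  ...            mean_squared_error(y_true, y_pred))
  True
  >>> np.isclose(mean_tweedie_deviance(y_true, y_pred, power=1),
  ...            mean_poisson_deviance(y_true, y_pred))
  True
  >>> np.isclose(mean_tweedie_deviance(y_true, y_pred, power=2),
  ...            mean_gamma_deviance(y_true, y_pred))
  True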
If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample, and :math:`y_i` is the corresponding true value, then the mean Tweedie @@ -2046,48 +2046,49 @@ deviance error (D) estimated over :math:`n_{\text{samples}}` is defined as \text{D}(y, \hat{y}) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \begin{cases} - (y_i-\hat{y}_i)^2, & \text{for }p=0\text{ (Normal)}\\ - 2(y_i \log(y/\hat{y}_i) + \hat{y}_i - y_i), & \text{for }p=1\text{ (Poisson)}\\ - 2(\log(\hat{y}_i/y_i) + y_i/\hat{y}_i - 1), & \text{for }p=2\text{ (Gamma)}\\ + (y_i-\hat{y}_i)^2, & \text{for }\text{power}=0\text{ (Normal)}\\ + 2(y_i \log(y/\hat{y}_i) + \hat{y}_i - y_i), & \text{for power}=1\text{ (Poisson)}\\ + 2(\log(\hat{y}_i/y_i) + y_i/\hat{y}_i - 1), & \text{for power}=2\text{ (Gamma)}\\ 2\left(\frac{\max(y_i,0)^{2-p}}{(1-p)(2-p)}- \frac{y\,\hat{y}^{1-p}_i}{1-p}+\frac{\hat{y}^{2-p}_i}{2-p}\right), & \text{otherwise} \end{cases} -Tweedie deviance is a homogeneous function of degree ``2-p``. -Thus, Gamma distribution with `p=2` means that simultaneously scaling `y_true` -and `y_pred` has no effect on the deviance. For Poisson distribution `p=1` -the deviance scales linearly, and for Normal distribution (`p=0`), -quadratically. In general, the higher `p` the less weight is given to extreme -deviations between true and predicted targets. +Tweedie deviance is a homogeneous function of degree ``2-power``. +Thus, Gamma distribution with ``power=2`` means that simultaneously scaling +``y_true`` and ``y_pred`` has no effect on the deviance. For Poisson +distribution ``power=1`` the deviance scales linearly, and for Normal +distribution (``power=0``), quadratically. In general, the higher +``power`` the less weight is given to extreme deviations between true +and predicted targets. For instance, let's compare the two predictions 1.0 and 100 that are both 50% of their corresponding true value. -The mean squared error (``p=0``) is very sensitive to the +The mean squared error (``power=0``) is very sensitive to the prediction difference of the second point,:: >>> from sklearn.metrics import mean_tweedie_deviance - >>> mean_tweedie_deviance([1.0], [1.5], p=0) + >>> mean_tweedie_deviance([1.0], [1.5], power=0) 0.25 - >>> mean_tweedie_deviance([100.], [150.], p=0) + >>> mean_tweedie_deviance([100.], [150.], power=0) 2500.0 If we increase ``p`` to 1,:: - >>> mean_tweedie_deviance([1.0], [1.5], p=1) + >>> mean_tweedie_deviance([1.0], [1.5], power=1) 0.18... - >>> mean_tweedie_deviance([100.], [150.], p=1) + >>> mean_tweedie_deviance([100.], [150.], power=1) 18.9... -the difference in errors decreases. Finally, by setting, ``p=2``:: +the difference in errors decreases. Finally, by setting, ``power=2``:: - >>> mean_tweedie_deviance([1.0], [1.5], p=2) + >>> mean_tweedie_deviance([1.0], [1.5], power=2) 0.14... - >>> mean_tweedie_deviance([100.], [150.], p=2) + >>> mean_tweedie_deviance([100.], [150.], power=2) 0.14... -we would get identical errors. The deviance when `p=2` is thus only +we would get identical errors. The deviance when ``power=2`` is thus only sensitive to relative errors. .. 
_clustering_metrics: diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index a1ae25fec278c..73db0acc945e6 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -624,7 +624,7 @@ def max_error(y_true, y_pred): return np.max(np.abs(y_true - y_pred)) -def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): +def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): """Mean Tweedie deviance regression loss. Read more in the :ref:`User Guide `. @@ -640,20 +640,21 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): sample_weight : array-like, shape (n_samples,), optional Sample weights. - p : float, optional - Tweedie power parameter. Either p <= 0 or p >= 1. + power : float, default=0 + Tweedie power parameter. Either power <= 0 or power >= 1. The higher `p` the less weight is given to extreme deviations between true and predicted targets. - - p < 0: Extreme stable distribution. Requires: y_pred > 0. - - p = 0 : Normal distribution, output corresponds to + - power < 0: Extreme stable distribution. Requires: y_pred > 0. + - power = 0 : Normal distribution, output corresponds to mean_squared_error. y_true and y_pred can be any real numbers. - - p = 1 : Poisson distribution. Requires: y_true >= 0 and y_pred > 0. + - power = 1 : Poisson distribution. Requires: y_true >= 0 and + y_pred > 0. - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0 and y_pred > 0. - - p = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. - - p = 3 : Inverse Gaussian distribution. Requires: y_true > 0 + - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. + - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0 and y_pred > 0. - otherwise : Positive stable distribution. Requires: y_true > 0 and y_pred > 0. @@ -668,7 +669,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): >>> from sklearn.metrics import mean_tweedie_deviance >>> y_true = [2, 0, 1, 4] >>> y_pred = [0.5, 0.5, 2., 2.] - >>> mean_tweedie_deviance(y_true, y_pred, p=1) + >>> mean_tweedie_deviance(y_true, y_pred, power=1) 1.4260... """ from ..linear_model._glm.distribution import TweedieDistribution @@ -682,7 +683,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): sample_weight = column_or_1d(sample_weight) sample_weight = sample_weight[:, np.newaxis] - dist = TweedieDistribution(power=p) + dist = TweedieDistribution(power=power) dev = dist.unit_deviance(y_true, y_pred, check_input=True) return np.average(dev, weights=sample_weight) @@ -721,7 +722,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): 1.4260... """ return mean_tweedie_deviance( - y_true, y_pred, sample_weight=sample_weight, p=1 + y_true, y_pred, sample_weight=sample_weight, power=1 ) @@ -759,5 +760,5 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): 1.0568... """ return mean_tweedie_deviance( - y_true, y_pred, sample_weight=sample_weight, p=2 + y_true, y_pred, sample_weight=sample_weight, power=2 ) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 6dc31676357f5..0f987a088bb84 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -36,7 +36,7 @@ def test_regression_metrics(n_samples=50): assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) 
- assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=0), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=0), mean_squared_error(y_true, y_pred)) # Tweedie deviance needs positive y_pred, except for p=0, @@ -45,15 +45,15 @@ def test_regression_metrics(n_samples=50): y_true = np.arange(1, 1 + n_samples) y_pred = 2 * y_true n = n_samples - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=-1), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=-1), 5/12 * n * (n**2 + 2 * n + 1)) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=1), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=1), (n + 1) * (1 - np.log(2))) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=2), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=2), 2 * np.log(2) - 1) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=3/2), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3/2), ((6 * np.sqrt(2) - 8) / n) * np.sqrt(y_true).sum()) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=3), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3), np.sum(1 / y_true) / (4 * n)) @@ -101,40 +101,41 @@ def test_regression_metrics_at_limits(): mean_squared_log_error, [1., -2., 3.], [1., 2., 3.]) # Tweedie deviance error - p = -1.2 - assert_allclose(mean_tweedie_deviance([0], [1.], p=p), - 2./(2.-p), rtol=1e-3) + power = -1.2 + assert_allclose(mean_tweedie_deviance([0], [1.], power=power), + 2 / (2 - power), rtol=1e-3) with pytest.raises(ValueError, match="can only be used on strictly positive mu."): - mean_tweedie_deviance([0.], [0.], p=p) - assert_almost_equal(mean_tweedie_deviance([0.], [0.], p=0), 0.00, 2) + mean_tweedie_deviance([0.], [0.], power=power) + assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=1.0) + mean_tweedie_deviance([0.], [0.], power=1.0) - p = 1.5 - assert_allclose(mean_tweedie_deviance([0.], [1.], p=p), 2./(2.-p)) + power = 1.5 + assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), + 2 / (2 - power)) msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=p) - p = 2. - assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, + mean_tweedie_deviance([0.], [0.], power=power) + power = 2. + assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) msg = "can only be used on strictly positive y and mu." with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=p) - p = 3. - assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), + mean_tweedie_deviance([0.], [0.], power=power) + power = 3. + assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) msg = "can only be used on strictly positive y and mu." 
with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=p) + mean_tweedie_deviance([0.], [0.], power=power) with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"): - mean_tweedie_deviance([0.], [0.], p=0.5) + mean_tweedie_deviance([0.], [0.], power=0.5) def test__check_reg_targets(): @@ -274,21 +275,21 @@ def test_tweedie_deviance_continuity(): y_true = np.random.RandomState(0).rand(n_samples) + 0.1 y_pred = np.random.RandomState(1).rand(n_samples) + 0.1 - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=0 - 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=0)) + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=0 - 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=0)) # Ws we get closer to the limit, with 1e-12 difference the absolute # tolerance to pass the below check increases. There are likely # numerical precision issues on the edges of different definition # regions. - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=1 + 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=1), + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=1 + 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=1), atol=1e-6) - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=2 - 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=2), + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=2 - 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=2), atol=1e-6) - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=2 + 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=2), + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=2 + 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=2), atol=1e-6) From adbf997ba53d68e53a8d5997ffecd98ce4e13863 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 30 Aug 2019 12:07:41 +0200 Subject: [PATCH 157/209] Rename predicted mean mu to y_pred --- sklearn/linear_model/_glm/distribution.py | 173 +++++++++--------- sklearn/linear_model/_glm/glm.py | 37 ++-- sklearn/linear_model/_glm/link.py | 42 ++--- .../_glm/tests/test_distribution.py | 4 +- 4 files changed, 132 insertions(+), 124 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e3dfd1fcd3cf5..a5e42bcee5d1c 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -28,17 +28,17 @@ def _safe_lin_pred(X, coef): class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). - The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by .. math:: p(y| \theta, \phi) = c(y, \phi) \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) = \tilde{c}(y, \phi) - \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + \exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right) - with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, - variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, - unit variance :math:`v(\mu)` and - unit deviance :math:`d(y,\mu)`. + with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`, + unit variance :math:`v(y_\textrm{pred})` and + unit deviance :math:`d(y,y_\textrm{pred})`. Methods ------- @@ -75,58 +75,60 @@ def in_y_range(self, y): return np.greater(y, self._lower_bound.value) @abstractmethod - def unit_variance(self, mu): + def unit_variance(self, y_pred): r"""Compute the unit variance function. 
- The unit variance :math:`v(\mu)` determines the variance as - a function of the mean :math:`\mu` by - :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. - It can also be derived from the unit deviance :math:`d(y,\mu)` as + The unit variance :math:`v(y_\textrm{pred})` determines the variance as + a function of the mean :math:`y_\textrm{pred}` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(y_\textrm{pred}_i)`. + It can also be derived from the unit deviance + :math:`d(y,y_\textrm{pred})` as - .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ - \partial\mu^2}}\big|_{y=\mu} + .. math:: v(y_\textrm{pred}) = \frac{2}{ + \frac{\partial^2 d(y,y_\textrm{pred})}{ + \partialy_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}} See also :func:`variance`. Parameters ---------- - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. """ pass # pragma: no cover @abstractmethod - def unit_variance_derivative(self, mu): - r"""Compute the derivative of the unit variance w.r.t. mu. + def unit_variance_derivative(self, y_pred): + r"""Compute the derivative of the unit variance w.r.t. y_pred. - Return :math:`v'(\mu)`. + Return :math:`v'(y_\textrm{pred})`. Parameters ---------- - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Target values. """ pass # pragma: no cover @abstractmethod - def unit_deviance(self, y, mu, check_input=False): + def unit_deviance(self, y, y_pred, check_input=False): r"""Compute the unit deviance. - The unit_deviance :math:`d(y,\mu)` can be defined by the + The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + :math:`d(y,y_\textrm{pred}) = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` Parameters ---------- y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. check_input : bool, default=False - If True raise an exception on invalid y or mu values, otherwise + If True raise an exception on invalid y or y_pred values, otherwise they will be propagated as NaN. Returns ------- @@ -135,31 +137,33 @@ def unit_deviance(self, y, mu, check_input=False): """ pass # pragma: no cover - def unit_deviance_derivative(self, y, mu): - r"""Compute the derivative of the unit deviance w.r.t. mu. + def unit_deviance_derivative(self, y, y_pred): + r"""Compute the derivative of the unit deviance w.r.t. y_pred. The derivative of the unit deviance is given by - :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` - with unit variance :math:`v(\mu)`. + :math:`\frac{\partial}{\partialy_\textrm{pred}}d(y,y_\textrm{pred}) + = -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}` + with unit variance :math:`v(y_\textrm{pred})`. Parameters ---------- y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. """ - return -2 * (y - mu) / self.unit_variance(mu) + return -2 * (y - y_pred) / self.unit_variance(y_pred) - def deviance(self, y, mu, weights=1): + def deviance(self, y, y_pred, weights=1): r"""Compute the deviance. The deviance is a weighted sum of the per sample unit deviances, - :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` - with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + :math:`D = \sum_i s_i \cdot d(y_i, y_\textrm{pred}_i)` + with weights :math:`s_i` and unit deviance + :math:`d(y,y_\textrm{pred})`. 
In terms of the log-likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{s}) + \left(loglike(y,y_\textrm{pred},\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. Parameters @@ -167,51 +171,52 @@ def deviance(self, y, mu, weights=1): y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return np.sum(weights * self.unit_deviance(y, mu)) + return np.sum(weights * self.unit_deviance(y, y_pred)) - def deviance_derivative(self, y, mu, weights=1): - """Compute the derivative of the deviance w.r.t. mu. + def deviance_derivative(self, y, y_pred, weights=1): + r"""Compute the derivative of the deviance w.r.t. y_pred. - It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + It gives :math:`\frac{\partial}{\partial y_\textrm{pred}} + D(y, \y_\textrm{pred}; weights)`. Parameters ---------- y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return weights * self.unit_deviance_derivative(y, mu) + return weights * self.unit_deviance_derivative(y, y_pred) - def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu and the derivative of the deviance w.r.t coef.""" + def _y_pred_deviance_derivative(self, coef, X, y, weights, link): + """Compute y_pred and the derivative of the deviance w.r.t coef.""" lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) + y_pred = link.inverse(lin_pred) d1 = link.inverse_derivative(lin_pred) - temp = d1 * self.deviance_derivative(y, mu, weights) + temp = d1 * self.deviance_derivative(y, y_pred, weights) if coef.size == X.shape[1] + 1: devp = np.concatenate(([temp.sum()], temp @ X)) else: devp = temp @ X # same as X.T @ temp - return mu, devp + return y_pred, devp class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely - defined by it's mean-variance relationship - :math:`\mathrm{Var}[Y] \propto \mu^power`. + A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]` + is uniquely defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^power`. Special cases are: @@ -228,7 +233,7 @@ class TweedieDistribution(ExponentialDispersionModel): ---------- power : float (default=0) The variance power of the `unit_variance` - :math:`v(\mu) = \mu^{power}`. + :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`. 
For ``0 0 - if (mu <= 0).any(): - raise ValueError(message + "strictly positive mu.") + # 'Extreme stable', y any realy number, y_pred > 0 + if (y_pred <= 0).any(): + raise ValueError(message + "strictly positive y_pred.") elif p == 0: - # Normal, y and mu can be any real number + # Normal, y and y_pred can be any real number pass elif 0 < p < 1: raise ValueError("Tweedie deviance is only defined for " "power<=0 and power>=1.") elif 1 <= p < 2: - # Poisson and Compount poisson distribution, y >= 0, mu > 0 - if (y < 0).any() or (mu <= 0).any(): + # Poisson and Compount poisson distribution, y >= 0, y_pred > 0 + if (y < 0).any() or (y_pred <= 0).any(): raise ValueError(message + "non-negative y and strictly " - "positive mu.") + "positive y_pred.") elif p >= 2: - # Gamma and Extreme stable distribution, y and mu > 0 - if (y <= 0).any() or (mu <= 0).any(): - raise ValueError(message + "strictly positive y and mu.") + # Gamma and Extreme stable distribution, y and y_pred > 0 + if (y <= 0).any() or (y_pred <= 0).any(): + raise ValueError(message + + "strictly positive y and y_pred.") else: # pragma: nocover # Unreachable statement raise ValueError if p < 0: - # 'Extreme stable', y any realy number, mu > 0 + # 'Extreme stable', y any realy number, y_pred > 0 dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p)) - - y * np.power(mu, 1-p) / (1-p) - + np.power(mu, 2-p) / (2-p)) + - y * np.power(y_pred, 1-p) / (1-p) + + np.power(y_pred, 2-p) / (2-p)) elif p == 0: - # Normal distribution, y and mu any real number - dev = (y - mu)**2 + # Normal distribution, y and y_pred any real number + dev = (y - y_pred)**2 elif p < 1: raise ValueError("Tweedie deviance is only defined for power<=0 " "and power>=1.") elif p == 1: # Poisson distribution - dev = 2 * (xlogy(y, y/mu) - y + mu) + dev = 2 * (xlogy(y, y/y_pred) - y + y_pred) elif p == 2: # Gamma distribution - dev = 2 * (np.log(mu/y) + y/mu - 1) + dev = 2 * (np.log(y_pred/y) + y/y_pred - 1) else: dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p)) - - y * np.power(mu, 1-p) / (1-p) - + np.power(mu, 2-p) / (2-p)) + - y * np.power(y_pred, 1-p) / (1-p) + + np.power(y_pred, 2-p) / (2-p)) return dev diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 7c518bb3f8149..cab89766c0b92 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,8 +31,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). Therefore, - the fit minimizes the following objective function with L2 + fitting and predicting the mean of the target y as y_pred=h(X*w). + Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) @@ -66,8 +66,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): link : {'auto', 'identity', 'log'} or an instance of class Link, \ optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor - (X*coef) to expectation (mu). Option 'auto' sets the link depending on - the chosen family as follows: + (X*coef) to expectation (y_pred). Option 'auto' sets the link + depending on the chosen family as follows: - 'identity' for family 'normal' @@ -146,9 +146,9 @@ def fit(self, X, y, sample_weight=None): optional (default=None) Individual weights w_i for each sample. 
Note that for an Exponential Dispersion Model (EDM), one has - Var[Y_i]=phi/w_i * v(mu). - If Y_i ~ EDM(mu, phi/w_i), then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + Var[Y_i]=phi/w_i * v(y_pred). + If Y_i ~ EDM(y_pred, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(y_pred, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. Returns @@ -267,10 +267,10 @@ def fit(self, X, y, sample_weight=None): if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): - mu, devp = family._mu_deviance_derivative( + y_pred, devp = family._y_pred_deviance_derivative( coef, X, y, weights, link ) - dev = family.deviance(y, mu, weights) + dev = family.deviance(y, y_pred, weights) intercept = (coef.size == X.shape[1] + 1) idx = 1 if intercept else 0 # offset if coef[0] is intercept coef_scaled = alpha * coef[idx:] @@ -337,8 +337,8 @@ def predict(self, X): """ # check_array is done in _linear_predictor eta = self._linear_predictor(X) - mu = self._link_instance.inverse(eta) - return mu + y_pred = self._link_instance.inverse(eta) + return y_pred def score(self, X, y, sample_weight=None): """Compute D^2, the percentage of deviance explained. @@ -376,8 +376,8 @@ def score(self, X, y, sample_weight=None): # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) weights = _check_sample_weight(sample_weight, X) - mu = self.predict(X) - dev = self._family_instance.deviance(y, mu, weights=weights) + y_pred = self.predict(X) + dev = self._family_instance.deviance(y, y_pred, weights=weights) y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) return 1 - dev / dev_null @@ -399,7 +399,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): """Regression with the response variable y following a Poisson distribution GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). + fitting and predicting the mean of the target y as y_pred=h(X*w). The fit minimizes the following objective function with L2 regularization:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 @@ -487,7 +487,7 @@ class GammaRegressor(GeneralizedLinearRegressor): """Regression with the response variable y following a Gamma distribution GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). + fitting and predicting the mean of the target y as y_pred=h(X*w). The fit minimizes the following objective function with L2 regularization:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 @@ -572,10 +572,10 @@ def family(self, value): class TweedieRegressor(GeneralizedLinearRegressor): - """Regression with the response variable y following a Tweedie distribution + r"""Regression with the response variable y following a Tweedie distribution GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). + fitting and predicting the mean of the target y as y_pred=h(X*w). The fit minimizes the following objective function with L2 regularization:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 @@ -590,7 +590,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): power : float (default=0) The power determines the underlying target distribution. 
By definition it links distribution variance (:math:`v`) and - mean (:math:`\\mu`): :math:`v(\\mu) = \\mu^{power}`. + mean (:math:`\y_\textrm{pred}`): + :math:`v(\y_\textrm{pred}) = \y_\textrm{pred}^{power}`. For ``0 Date: Fri, 30 Aug 2019 12:13:43 +0200 Subject: [PATCH 158/209] Fix link parameter documentation in TweedieRegression --- sklearn/linear_model/_glm/glm.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index cab89766c0b92..d6da8b8b80949 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -460,12 +460,12 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, link='log', + def __init__(self, alpha=1.0, fit_intercept=True, solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="poisson", link=link, + family="poisson", link='log', solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) @@ -548,12 +548,12 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, link='log', - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + def __init__(self, alpha=1.0, fit_intercept=True, solver='lbfgs', + max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="gamma", link=link, + family="gamma", link='log', solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) @@ -619,6 +619,15 @@ class TweedieRegressor(GeneralizedLinearRegressor): case, the design matrix X must have full column rank (no collinearities). + link : {'auto', 'identity', 'log'}, default='auto' + The link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (y_pred). Option 'auto' sets the link + depending on the chosen family as follows: + + - 'identity' for Normal distribution + + - 'log' for Poisson, Gamma or Inverse Gaussian distributions + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -659,9 +668,9 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. 
""" - def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, + link='auto', solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, From 3b526e98ce11514090cbdad890f3c2d13664cc83 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 30 Aug 2019 12:44:05 +0200 Subject: [PATCH 159/209] EXA Use a simpler pipeline for GBDT in poisson regression example --- ...plot_poisson_regression_non_normal_loss.py | 66 ++++++++++++++----- 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 9deca6c25032a..7ab772fb59920 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -34,6 +34,7 @@ from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import GradientBoostingRegressor @@ -134,10 +135,14 @@ def load_mtpl2(n_samples=100000): # To evaluate the pertinence of the used metrics, we will consider as a # baseline an estimator that returns 0 for any input. -df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) +df_train, df_test = train_test_split(df, random_state=0) -dummy = DummyRegressor(strategy='constant', constant=0) -dummy.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) +dummy = make_pipeline( + column_trans, + DummyRegressor(strategy='constant', constant=0) +) +dummy.fit(df_train, df_train.Frequency, + dummyregressor__sample_weight=df_train.Exposure) ############################################################################## # @@ -149,14 +154,14 @@ def score_estimator(estimator, df_test, eps=1e-5): """Score an estimatr on the test set""" print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, estimator.predict(X_test), + df_test.Frequency.values, estimator.predict(df_test), df_test.Exposure.values)) print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, estimator.predict(X_test), + df_test.Frequency.values, estimator.predict(df_test), df_test.Exposure.values)) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(estimator.predict(X_test), eps), + df_test.Frequency.values, np.fmax(estimator.predict(df_test), eps), df_test.Exposure.values)) @@ -168,12 +173,14 @@ def score_estimator(estimator, df_test, eps=1e-5): # We start by modeling the target variable with the least squares linear # regression model, -linregr = LinearRegression() -linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) +linregr = make_pipeline(column_trans, LinearRegression()) +linregr.fit(df_train, df_train.Frequency, + linearregression__sample_weight=df_train.Exposure) print('Number Negatives: %s / total: %s' % ( - (linregr.predict(X_test) < 0).sum(), X_test.shape[0])) + (linregr.predict(df_train) < 0).sum(), + df_train.shape[0])) print("LinearRegression") score_estimator(linregr, 
df_test) @@ -182,8 +189,12 @@ def score_estimator(estimator, df_test, eps=1e-5): # # Next we fit the Poisson regressor on the target variable, -glm_freq = PoissonRegressor(alpha=0, max_iter=1000) -glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) +glm_freq = make_pipeline( + column_trans, + PoissonRegressor(alpha=0, max_iter=1000) +) +glm_freq.fit(df_train, df_train.Frequency, + poissonregressor__sample_weight=df_train.Exposure) print("PoissonRegressor") score_estimator(glm_freq, df_test) @@ -191,12 +202,31 @@ def score_estimator(estimator, df_test, eps=1e-5): ############################################################################## # # Finally we will consider a non linear model with Gradient boosting that -# still minimizes the least square error. - - -gbr = GradientBoostingRegressor() -gbr.fit(X_train, df_train.Frequency.values, - sample_weight=df_train.Exposure.values) +# still minimizes the least square error. Gradient Boostring Decision Trees do +# not require for categorical data to be one hot encoded, therefore here we use +# a simpler pre-processing pipeline without ``KBinsDiscretizer`` and with +# ``OrdinalEncoder`` instead of ``OneHotEncoder``. + + +gbr = make_pipeline( + ColumnTransformer( + [ + ( + "Veh_Brand_Gas_Region", + OrdinalEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("Continious", "passthrough", ["VehAge", "DrivAge", "BonusMalus"]), + ("Density_log", make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler()), + ["Density"]), + ], + remainder="drop", + ), + GradientBoostingRegressor() +) +gbr.fit(df_train, df_train.Frequency.values, + gradientboostingregressor__sample_weight=df_train.Exposure.values) print("GradientBoostingRegressor") @@ -224,7 +254,7 @@ def score_estimator(estimator, df_test, eps=1e-5): axes[0].set_title('Experimental data') for idx, model in enumerate([linregr, glm_freq, gbr]): - y_pred = model.predict(X_train) + y_pred = model.predict(df_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) axes[idx + 1].set_title(model.__class__.__name__) From b1eb611f1d7364af25f1d8f4222ff0a9cb494efa Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Sep 2019 16:49:52 +0200 Subject: [PATCH 160/209] Minor fixes for user guide --- doc/modules/linear_model.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 622ab335059ab..29c329f6f333a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -874,7 +874,7 @@ with 'log' loss, which might be even faster but requires more tuning. It is possible to obtain the p-values and confidence intervals for coefficients in cases of regression without penalization. The `statsmodels package ` natively supports this. - Within sklearn, one could use bootstrapping instead as well. + Within sklearn, one could use bootstrapping instead as well. :class:`LogisticRegressionCV` implements Logistic Regression with built-in @@ -919,8 +919,8 @@ The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as .. math:: d(y, \mu) = -2\phi\cdot - \left( log P(y|\mu,\phi) - - log P(y|y,\phi)\right) + \left( \log p(y|\mu,\phi) + - \log p(y|y,\phi)\right). The following table lists some specific EDM distributions—all are Tweedie distributions—and some of their properties. 
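Because all of these are Tweedie distributions, the dedicated Poisson and
Gamma regressors correspond to fixed choices of ``power``; for instance a
Poisson fit can also be expressed with :class:`TweedieRegressor` directly. A
small added sketch on synthetic data (agreement is only expected up to the
solver tolerance)::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor, TweedieRegressor

    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 2))
    y = rng.poisson(np.exp(0.3 * X[:, 0]))

    pois = PoissonRegressor().fit(X, y)
    tweedie = TweedieRegressor(power=1, link='log').fit(X, y)

    # Same family, link and default penalty, hence essentially the same fit.
    assert np.allclose(pois.coef_, tweedie.coef_, rtol=1e-4)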
@@ -939,7 +939,7 @@ Usage ----- In the following use cases, a loss different from the squared loss might be -appropriate, +appropriate: * If the target values :math:`y` are counts (non-negative integer valued) or frequencies (non-negative), you might use a Poisson deviance with log-link. @@ -960,7 +960,7 @@ log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned distributions using the appropriate ``power`` parameter, i.e. the exponent -of the unit variance function, +of the unit variance function: - ``power = 0``: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally @@ -987,15 +987,15 @@ of the unit variance function, together with :math:`s=\mathrm{exposure}` as sample weights. This is done in both examples linked below. * The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. - * If the target y is a ratio, appropriate sample weights s should be + the first two moments to be :math:`E[Y_i]=\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\frac{\phi}{s_i} v(\mu_i)`. + * If the target `y` is a ratio, appropriate sample weights ``s`` should be provided. As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + Consider :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. From d964c01324a9945253549fbfb2ec1d3b3b18b79f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Sep 2019 19:01:08 +0200 Subject: [PATCH 161/209] EXA Poisson: minor changes --- ...plot_poisson_regression_non_normal_loss.py | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 7ab772fb59920..b71fba9236ef5 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -3,11 +3,11 @@ Poisson regression and non-normal loss ====================================== -This example illustrates the use of linear Poisson regression +This example illustrates the use of log-linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compares it with models learned with least squared error. The goal is to predict the -number of insurance claims (or frequency) following car accidents for a user -given historical data over a population of users. +number of insurance claims (or frequency) following car accidents for a +policyholder given historical data over a population of policyholders. We start by defining a few helper functions for loading the data and visualizing results. @@ -48,7 +48,8 @@ def load_mtpl2(n_samples=100000): Parameters ---------- n_samples: int, default=100000 - number of samples to select (for faster run time). + number of samples to select (for faster run time). Full dataset has + 678013 samples. 
""" # freMTPL2freq dataset from https://www.openml.org/d/41214 @@ -76,14 +77,15 @@ def load_mtpl2(n_samples=100000): # 1. Loading datasets and pre-processing # -------------------------------------- # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, # containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same user ids. +# containing the claim amount (``ClaimAmount``) for the same policy ids +# (``IDpol``). df = load_mtpl2(n_samples=50000) # Note: filter out claims with zero amount, as the severity model -# requires a strictly positive target values. +# requires strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # correct for unreasonable observations (that might be data error) @@ -116,9 +118,9 @@ def load_mtpl2(n_samples=100000): # The number of claims (``ClaimNb``) is a positive integer that can be modeled # as a Poisson distribution. It is then assumed to be the number of discrete # events occurring with a constant rate in a given time interval -# (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution, and use ``Exposure`` as -# `sample_weight`. +# (``Exposure``, in units of years). Here we model the frequency +# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, +# and use ``Exposure`` as `sample_weight`. df["Frequency"] = df.ClaimNb / df.Exposure @@ -126,20 +128,23 @@ def load_mtpl2(n_samples=100000): pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() ) +print("Average Frequency = {}" + .format(np.average(df.Frequency, weights=df.Exposure))) + ############################################################################## # -# It worth noting that 96 % of users have 0 claims, and if we were to convert -# this problem into a binary classification task, it would be significantly -# imbalanced. +# It worth noting that 96 % of policyholders have zero claims, and if we were +# to convert this problem into a binary classification task, it would be +# significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a -# baseline an estimator that returns 0 for any input. +# baseline an estimator that returns the mean of the training sample. 
df_train, df_test = train_test_split(df, random_state=0) dummy = make_pipeline( column_trans, - DummyRegressor(strategy='constant', constant=0) + DummyRegressor(strategy='mean') ) dummy.fit(df_train, df_train.Frequency, dummyregressor__sample_weight=df_train.Exposure) @@ -257,7 +262,7 @@ def score_estimator(estimator, df_test, eps=1e-5): y_pred = model.predict(df_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) - axes[idx + 1].set_title(model.__class__.__name__) + axes[idx + 1].set_title(model[-1].__class__.__name__) for axi in axes: axi.set( From a1844b8543a9a43fadfc0523e48cd0135e58ba37 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 2 Sep 2019 20:29:56 +0200 Subject: [PATCH 162/209] Fix mu->y_pred and p->power --- doc/whats_new/v0.22.rst | 16 ++++++++-------- .../plot_tweedie_regression_insurance_claims.py | 4 ++-- sklearn/metrics/regression.py | 4 ++-- sklearn/metrics/scorer.py | 4 ++-- sklearn/metrics/tests/test_common.py | 4 ++-- sklearn/metrics/tests/test_regression.py | 10 +++++----- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 6e3d4822b261d..d6d52732ba714 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -241,10 +241,10 @@ Changelog :user:`Mohamed Maskani `, and :user:`Thomas Fan `. - |Feature| Add :class:`metrics.mean_tweedie_deviance` measuring the - Tweedie deviance for a power parameter ``p``. Also add mean Poisson deviance - :class:`metrics.mean_poisson_deviance` and mean Gamma deviance + Tweedie deviance for a power parameter ``power``. Also add mean Poisson + deviance :class:`metrics.mean_poisson_deviance` and mean Gamma deviance :class:`metrics.mean_gamma_deviance` that are special cases of the Tweedie - deviance for `p=1` and `p=2` respectively. + deviance for `power=1` and `power=2` respectively. :pr:`13938` by :user:`Christian Lorentzen ` and `Roman Yurchak`_. @@ -306,19 +306,19 @@ Changelog - |Enhancement| SVM now throws more specific error when fit on non-square data and kernel = precomputed. :class:`svm.BaseLibSVM` :pr:`14336` by :user:`Gregory Dexter `. - + :mod:`sklearn.tree` ................... - |Feature| Adds minimal cost complexity pruning, controlled by ``ccp_alpha``, to :class:`tree.DecisionTreeClassifier`, :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`, :class:`tree.ExtraTreeRegressor`, - :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, - :class:`ensemble.ExtraTreesClassifier`, + :class:`ensemble.ExtraTreesClassifier`, :class:`ensemble.ExtraTreesRegressor`, - :class:`ensemble.RandomTreesEmbedding`, - :class:`ensemble.GradientBoostingClassifier`, + :class:`ensemble.RandomTreesEmbedding`, + :class:`ensemble.GradientBoostingClassifier`, and :class:`ensemble.GradientBoostingRegressor`. :pr:`12887` by `Thomas Fan`_. diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index f866518b69db8..eb9769814ade5 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -426,7 +426,7 @@ class ClaimProdEstimator: """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. Use Tweedie deviance with `p=1.5`. + denormalized by exposure. Use Tweedie deviance with `power=1.5`. 
""" def __init__(self, est_freq, est_sev): @@ -527,7 +527,7 @@ def score(self, X, y, sample_weight=None): "predicted, frequency*severity model": np.sum( est_prod.predict(X, exposure=df.Exposure.values) ), - "predicted, tweedie, p=%.2f" + "predicted, tweedie, power=%.2f" % glm_total.best_estimator_.family.power: np.sum( glm_total.best_estimator_.predict(X) ), diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 73db0acc945e6..706c484334d21 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -693,7 +693,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): """Mean Poisson deviance regression loss. Poisson deviance is equivalent to the Tweedie deviance with - the power parameter `p=1`. + the power parameter `power=1`. Read more in the :ref:`User Guide `. @@ -730,7 +730,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): """Mean Gamma deviance regression loss. Gamma deviance is equivalent to the Tweedie deviance with - the power parameter `p=2`. It is invariant to scaling of + the power parameter `power=2`. It is invariant to scaling of the target variable, and mesures relative errors. Read more in the :ref:`User Guide `. diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index bf2892bdf83a2..e2496c83b666d 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -499,11 +499,11 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, greater_is_better=False, squared=False) neg_mean_poisson_deviance_scorer = make_scorer( - mean_tweedie_deviance, p=1., greater_is_better=False + mean_tweedie_deviance, power=1., greater_is_better=False ) neg_mean_gamma_deviance_scorer = make_scorer( - mean_tweedie_deviance, p=2., greater_is_better=False + mean_tweedie_deviance, power=2., greater_is_better=False ) # Standard Classification Scores diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 6459f93c68449..a8cabe984e563 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -102,11 +102,11 @@ "median_absolute_error": median_absolute_error, "explained_variance_score": explained_variance_score, "r2_score": partial(r2_score, multioutput='variance_weighted'), - "mean_normal_deviance": partial(mean_tweedie_deviance, p=0), + "mean_normal_deviance": partial(mean_tweedie_deviance, power=0), "mean_poisson_deviance": mean_poisson_deviance, "mean_gamma_deviance": mean_gamma_deviance, "mean_compound_poisson_deviance": - partial(mean_tweedie_deviance, p=1.4), + partial(mean_tweedie_deviance, power=1.4), } CLASSIFICATION_METRICS = { diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 0f987a088bb84..c3947db5ed857 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -105,31 +105,31 @@ def test_regression_metrics_at_limits(): assert_allclose(mean_tweedie_deviance([0], [1.], power=power), 2 / (2 - power), rtol=1e-3) with pytest.raises(ValueError, - match="can only be used on strictly positive mu."): + match="can only be used on strictly positive y_pred."): mean_tweedie_deviance([0.], [0.], power=power) assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) - msg = "only be used on non-negative y and strictly positive mu." + msg = "only be used on non-negative y and strictly positive y_pred." 
with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=1.0) power = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), 2 / (2 - power)) - msg = "only be used on non-negative y and strictly positive mu." + msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y and mu." + msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y and mu." + msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) From f5133920b47070f5b252dc4d8015745195130c44 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Sep 2019 18:34:08 +0200 Subject: [PATCH 163/209] EXA Tweedie: some improvements --- ...lot_tweedie_regression_insurance_claims.py | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index eb9769814ade5..22a26d880a869 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -13,8 +13,8 @@ which are: 1. Model the number of claims with a Poisson distribution, the average - claim amount as a Gamma distribution and multiply the predictions of both in - order to get the total claim amount. + claim amount per claim, also known as severity, as a Gamma distribution and + multiply the predictions of both in order to get the total claim amount. 2. Model total claim amount directly, typically with a Tweedie distribution of Tweedie power :math:`p \\in (1, 2)`. @@ -42,6 +42,7 @@ from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, GammaRegressor from sklearn.linear_model import TweedieRegressor +from sklearn.metrics import mean_tweedie_deviance from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -56,7 +57,8 @@ def load_mtpl2(n_samples=100000): Parameters ---------- n_samples: int, default=100000 - number of samples to select (for faster run time). + number of samples to select (for faster run time). Full dataset has + 678013 samples. """ # freMTPL2freq dataset from https://www.openml.org/d/41214 @@ -139,7 +141,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df = load_mtpl2(n_samples=60000) # Note: filter out claims with zero amount, as the severity model -# requires a strictly positive target values. +# requires strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # Correct for unreasonable observations (that might be data error) @@ -182,10 +184,10 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # # The number of claims (``ClaimNb``) is a positive integer that can be modeled # as a Poisson distribution. 
It is then assumed to be the number of discrete -# events occuring with a constant rate in a given time interval (``Exposure``). -# Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution, and use ``Exposure`` as -# `sample_weight`. +# events occuring with a constant rate in a given time interval +# (``Exposure``, in units of years). Here we model the frequency +# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, +# and use ``Exposure`` as `sample_weight`. df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) @@ -197,7 +199,10 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, def mean_deviance(estimator, y, y_pred, weights): if hasattr(estimator, "_family_instance"): - return estimator._family_instance.deviance(y, y_pred, weights) / len(y) + if weights is None: + weights = np.ones_like(y) + return (estimator._family_instance.deviance(y, y_pred, weights) + / np.sum(weights)) else: return np.nan @@ -320,10 +325,10 @@ def score_estimator( # # According to the observed data, the frequency of accidents is higher for # drivers younger than 30 years old, and it positively correlated with the -# `BonusMalus` variable. Out model is able to mostly correctly model +# `BonusMalus` variable. Our model is able to mostly correctly model # this behaviour. # -# 3. Severity model - Gamma Distribution +# 3. Severity model - Gamma distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically # shown to follow approximately a Gamma distribution. We fit a GLM model for @@ -333,7 +338,7 @@ def score_estimator( # # - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support # on :math:`(0, \infty)`, not :math:`[0, \infty)`. -# - We use ``ClaimNb`` as sample weights. +# - We use ``ClaimNb`` as `sample_weight`. mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 @@ -360,6 +365,8 @@ def score_estimator( ############################################################################## # +# Here, the scores for the test data call for caution as they are significantly +# worse than for the training data indicating an overfit. # Note that the resulting model is the average claim amount per claim. As such, # it is conditional on having at least one claim, and cannot be used to predict # the average claim amount per policy in general. @@ -412,10 +419,10 @@ def score_estimator( ############################################################################## # -# Overall the drivers age (``DrivAge``) has a weak impact on the claim +# Overall, the drivers age (``DrivAge``) has a weak impact on the claim # severity, both in observed and predicted data. # -# 4. Total Claims Amount -- Compound Poisson distribution +# 4. Total claim amount -- Compound Poisson distribution # ------------------------------------------------------- # # As mentionned in the introduction, the total claim amount can be modeled @@ -426,12 +433,16 @@ class ClaimProdEstimator: """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. Use Tweedie deviance with `power=1.5`. + denormalized by exposure. For scores, use Tweedie deviance with + `power=1.5`. 
""" def __init__(self, est_freq, est_sev): + from sklearn.linear_model._glm.distribution import TweedieDistribution + self.est_freq = est_freq self.est_sev = est_sev + self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): """Predict the total claim amount. @@ -442,14 +453,13 @@ def predict(self, X, exposure): def score(self, X, y, sample_weight=None): """Compute D², the percentage of deviance explained.""" - # TODO: remove this private import once d2_score is available - from sklearn.linear_model._glm.distribution import TweedieDistribution - + # TODO: use d2_score directly once it is available mu = self.predict(X, exposure=sample_weight) - family = TweedieDistribution(power=1.5) - dev = family.deviance(y, mu, weights=sample_weight) - y_mean = np.average(y, weights=sample_weight) - dev_null = family.deviance(y, y_mean, weights=sample_weight) + dev = mean_tweedie_deviance( + y, mu, sample_weight=sample_weight, power=1.5) + y_mean = np.average(y, weights=sample_weight) * np.ones_like(y) + dev_null = mean_tweedie_deviance( + y, y_mean, sample_weight=sample_weight, power=1.5) return 1. - dev / dev_null @@ -475,7 +485,7 @@ def score(self, X, y, sample_weight=None): from sklearn.model_selection import GridSearchCV -# exclude upper bound as power=2 does not support null y values. +# exclude upper bound as power>=2 does not support y=0. params = {"power": np.linspace(1 + 1e-4, 2 - 1e-4, 8)} From 84229a6d5fbe6cf9964f573496c66fe8c88bd2ab Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Sep 2019 21:19:44 +0200 Subject: [PATCH 164/209] Fix doc test --- doc/modules/linear_model.rst | 2 +- .../linear_model/plot_poisson_regression_non_normal_loss.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 29c329f6f333a..ee418af1d414b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1004,7 +1004,7 @@ The estimator can be used as follows:: >>> from sklearn.linear_model import TweedieRegressor >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - TweedieRegressor(alpha=0.5, power=1) + TweedieRegressor(alpha=0.5, link='log', power=1) >>> reg.coef_ array([0.2463..., 0.4337...]) >>> reg.intercept_ diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index b71fba9236ef5..d739c37d2bb60 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -206,8 +206,8 @@ def score_estimator(estimator, df_test, eps=1e-5): ############################################################################## # -# Finally we will consider a non linear model with Gradient boosting that -# still minimizes the least square error. Gradient Boostring Decision Trees do +# Finally, we will consider a non linear model with Gradient boosting that +# still minimizes the least square error. Gradient Boosting Decision Trees do # not require for categorical data to be one hot encoded, therefore here we use # a simpler pre-processing pipeline without ``KBinsDiscretizer`` and with # ``OrdinalEncoder`` instead of ``OneHotEncoder``. 
From 8c6c255cbad9cae5c82b5154f94f9a6a14cc6b3a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 11 Sep 2019 15:20:10 +0200 Subject: [PATCH 165/209] Fix test --- sklearn/metrics/tests/test_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 17bf7f828948c..f29e7d2ad1c13 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -118,7 +118,7 @@ def test_regression_metrics_at_limits(): power = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), 2 / (2 - power)) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 2. From 0a2331385daca833429b82a3493471cb45329ac4 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 12 Sep 2019 17:17:41 +0200 Subject: [PATCH 166/209] EXA Use Ridge and remove eps --- ...plot_poisson_regression_non_normal_loss.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d739c37d2bb60..4fb16f6419209 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -30,7 +30,7 @@ from sklearn.datasets import fetch_openml from sklearn.dummy import DummyRegressor from sklearn.compose import ColumnTransformer -from sklearn.linear_model import PoissonRegressor, LinearRegression +from sklearn.linear_model import Ridge, PoissonRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -152,22 +152,28 @@ def load_mtpl2(n_samples=100000): ############################################################################## # # The Poisson deviance cannot be computed on negative values predicted by the -# model, so we set the minimum predicted value to eps, +# model, so all models need to return positive preditions if we intend to +# use this metric, -def score_estimator(estimator, df_test, eps=1e-5): +def score_estimator(estimator, df_test): """Score an estimatr on the test set""" + y_pred = estimator.predict(df_test) + print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, estimator.predict(df_test), + df_test.Frequency.values, y_pred, df_test.Exposure.values)) print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, estimator.predict(df_test), + df_test.Frequency.values, y_pred, df_test.Exposure.values)) + # ignore negative predictions + mask = y_pred > 0 + print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(estimator.predict(df_test), eps), - df_test.Exposure.values)) + df_test.Frequency.values[mask], y_pred[mask], + df_test.Exposure.values[mask])) print("DummyRegressor") @@ -178,16 +184,16 @@ def score_estimator(estimator, df_test, eps=1e-5): # We start by modeling the target variable with the least squares linear # regression model, -linregr = make_pipeline(column_trans, LinearRegression()) +linregr = make_pipeline(column_trans, Ridge(alpha=1.0)) linregr.fit(df_train, df_train.Frequency, - linearregression__sample_weight=df_train.Exposure) + 
ridge__sample_weight=df_train.Exposure) print('Number Negatives: %s / total: %s' % ( (linregr.predict(df_train) < 0).sum(), df_train.shape[0])) -print("LinearRegression") +print("Ridge") score_estimator(linregr, df_test) ############################################################################## @@ -196,7 +202,7 @@ def score_estimator(estimator, df_test, eps=1e-5): glm_freq = make_pipeline( column_trans, - PoissonRegressor(alpha=0, max_iter=1000) + PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) ) glm_freq.fit(df_train, df_train.Frequency, poissonregressor__sample_weight=df_train.Exposure) From 976b436ebbeff0f6d21f9ae06352cb6bb25c174e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 16 Sep 2019 13:26:16 +0200 Subject: [PATCH 167/209] Address comments in plot_poisson_regression_non_normal_loss.py --- ...plot_poisson_regression_non_normal_loss.py | 73 +++++++++++-------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 4fb16f6419209..fa1bc09a76285 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -23,6 +23,8 @@ # Authors: Christian Lorentzen # Roman Yurchak # License: BSD 3 clause +import warnings + import numpy as np import matplotlib.pyplot as plt import pandas as pd @@ -36,7 +38,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -149,11 +151,6 @@ def load_mtpl2(n_samples=100000): dummy.fit(df_train, df_train.Frequency, dummyregressor__sample_weight=df_train.Exposure) -############################################################################## -# -# The Poisson deviance cannot be computed on negative values predicted by the -# model, so all models need to return positive preditions if we intend to -# use this metric, def score_estimator(estimator, df_test): @@ -168,11 +165,17 @@ def score_estimator(estimator, df_test): df_test.Frequency.values, y_pred, df_test.Exposure.values)) - # ignore negative predictions + # ignore negative predictions, as they are invalid for + # the Poisson deviance mask = y_pred > 0 + if (~mask).any(): + warnings.warn("estimator yields negative predictions for {} samples " + "out of {}. 
These will be ignored while computing the " + "poisson deviance".format((~mask).sum(), mask.shape[0])) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values[mask], y_pred[mask], + df_test.Frequency.values[mask], + y_pred[mask], df_test.Exposure.values[mask])) @@ -184,14 +187,21 @@ def score_estimator(estimator, df_test): # We start by modeling the target variable with the least squares linear # regression model, -linregr = make_pipeline(column_trans, Ridge(alpha=1.0)) +linregr = make_pipeline( + column_trans, + Ridge(alpha=1.0) +) linregr.fit(df_train, df_train.Frequency, ridge__sample_weight=df_train.Exposure) - -print('Number Negatives: %s / total: %s' % ( - (linregr.predict(df_train) < 0).sum(), - df_train.shape[0])) +############################################################################## +# +# The Poisson deviance cannot be computed on negative values predicted by the +# model. For models that do return a few negative predictions +# (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, +# meaning that the obtained Poisson deviance is approximate. An alternative +# apporach could be to use class:`compose.TransformedTargetRegressor` +# meta-estimator to map ``y_pred`` to strictly positive domain. print("Ridge") score_estimator(linregr, df_test) @@ -212,40 +222,38 @@ def score_estimator(estimator, df_test): ############################################################################## # -# Finally, we will consider a non linear model with Gradient boosting that -# still minimizes the least square error. Gradient Boosting Decision Trees do +# Finally, we will consider a non linear model with a random forest that +# still minimizes the least square error. Random forest does # not require for categorical data to be one hot encoded, therefore here we use -# a simpler pre-processing pipeline without ``KBinsDiscretizer`` and with -# ``OrdinalEncoder`` instead of ``OneHotEncoder``. +# a simpler pre-processing pipeline with :class:`preprocessing.OrdinalEncoder`, gbr = make_pipeline( ColumnTransformer( [ ( - "Veh_Brand_Gas_Region", - OrdinalEncoder(), + "Veh_Brand_Gas_Region", OrdinalEncoder(), ["VehBrand", "VehPower", "VehGas", "Region", "Area"], ), - ("Continious", "passthrough", ["VehAge", "DrivAge", "BonusMalus"]), - ("Density_log", make_pipeline( - FunctionTransformer(np.log, validate=False), StandardScaler()), - ["Density"]), + ( + "Continious", "passthrough", + ["VehAge", "DrivAge", "BonusMalus", "Density"] + ), ], remainder="drop", ), - GradientBoostingRegressor() + RandomForestRegressor(min_weight_fraction_leaf=1e-2) ) gbr.fit(df_train, df_train.Frequency.values, - gradientboostingregressor__sample_weight=df_train.Exposure.values) + randomforestregressor__sample_weight=df_train.Exposure.values) -print("GradientBoostingRegressor") +print("RandomForestRegressor") score_estimator(gbr, df_test) ############################################################################## # -# In this example, although Gradient boosting minimizes the least square error, +# In this example, although random forest minimizes the least square error, # because of a higher predictive power it also results in a smaller Poisson # deviance than the Poisson regression model. # @@ -281,11 +289,12 @@ def score_estimator(estimator, df_test): # The experimental data presents a long tail distribution for ``y``. In all # models we predict the mean expected value, so we will have necessairily fewer # extreme values. 
Additionally normal distribution used in ``Ridge`` and -# ``GradientBoostingRegressor`` has a constant variance, while for the Poisson +# ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. # -# Thus, among the considered estimators, -# ``PoissonRegressor`` and ``GradientBoostingRegressor`` are better suited for -# modeling the long tail distribution of the data as compared to the ``Ridge`` -# estimator. +# Thus, among the considered estimators, ``PoissonRegressor`` is better suited +# for modeling the long tail distribution of the data as compared to the +# ``Ridge`` and ``RandomForestRegressor`` estimators. + +plt.show() From 7c850d1a7c9a5f468ee0136ee91e2d662aefa4da Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 16 Sep 2019 13:28:06 +0200 Subject: [PATCH 168/209] Lint --- examples/linear_model/plot_poisson_regression_non_normal_loss.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index fa1bc09a76285..5098016f22913 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -152,7 +152,6 @@ def load_mtpl2(n_samples=100000): dummyregressor__sample_weight=df_train.Exposure) - def score_estimator(estimator, df_test): """Score an estimatr on the test set""" From f64dc4a4c51128c6d1017911e23760a866ca4007 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 16 Sep 2019 15:24:30 +0200 Subject: [PATCH 169/209] Simplify plot_tweedie_regression_insurance_claims.py example --- ...plot_poisson_regression_non_normal_loss.py | 2 - ...lot_tweedie_regression_insurance_claims.py | 85 ++++++------------- 2 files changed, 25 insertions(+), 62 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 5098016f22913..769f321ff1562 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -295,5 +295,3 @@ def score_estimator(estimator, df_test): # Thus, among the considered estimators, ``PoissonRegressor`` is better suited # for modeling the long tail distribution of the data as compared to the # ``Ridge`` and ``RandomForestRegressor`` estimators. 
- -plt.show() diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 22a26d880a869..5a312f656d9ce 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -197,16 +197,6 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) -def mean_deviance(estimator, y, y_pred, weights): - if hasattr(estimator, "_family_instance"): - if weights is None: - weights = np.ones_like(y) - return (estimator._family_instance.deviance(y, y_pred, weights) - / np.sum(weights)) - else: - return np.nan - - def score_estimator( estimator, X_train, X_test, df_train, df_test, target, weights ): @@ -221,18 +211,25 @@ def score_estimator( for score_label, metric in [ ("D² explained", None), - ("mean deviance", partial(mean_deviance, estimator)), + ("mean deviance", mean_tweedie_deviance), ("mean abs. error", mean_absolute_error), ("mean squared error", mean_squared_error), ]: - if estimator.__class__.__name__ == "ClaimProdEstimator": - # ClaimProdEstimator is the product of frequency and severity - # models, denormalized by the exposure values. - # It does not fully follow the scikit-learn API and we - # must handle it separately. - y_pred = estimator.predict(X, exposure=df.Exposure.values) + if isinstance(estimator, tuple) and len(estimator) == 2: + # Score the model consisting of the product of frequency and + # severity models, denormalized by the exposure values. + est_freq, est_sev = estimator + y_pred = (df.Exposure.values * est_freq.predict(X) + * est_sev.predict(X)) + power = 1.5 else: y_pred = estimator.predict(X) + power = getattr(getattr(estimator, "_family_instance"), + "power") + + if score_label == "mean deviance": + metric = partial(mean_tweedie_deviance, power=power) + if metric is None: if not hasattr(estimator, "score"): continue @@ -248,7 +245,8 @@ def score_estimator( pd.DataFrame(res) .set_index(["metric", "subset"]) .score.unstack(-1) - .round(3) + .round(2) + .loc[:, ['train', 'test']] ) return res @@ -425,48 +423,16 @@ def score_estimator( # 4. Total claim amount -- Compound Poisson distribution # ------------------------------------------------------- # -# As mentionned in the introduction, the total claim amount can be modeled +# As mentioned in the introduction, the total claim amount can be modeled # either as the product of the frequency model by the severity model, +# denormalized by exposure. In the following code sample, the +# ``score_estimator`` is extended to score such a model. The mean deviance +# is computed assuming a Tweedie distribution with ``power=1.5`` to be +# comparable with the model from the following section, -class ClaimProdEstimator: - """Total claim amount estimator. - - Computed as the product of the frequency model by the serverity model, - denormalized by exposure. For scores, use Tweedie deviance with - `power=1.5`. - """ - - def __init__(self, est_freq, est_sev): - from sklearn.linear_model._glm.distribution import TweedieDistribution - - self.est_freq = est_freq - self.est_sev = est_sev - self._family_instance = TweedieDistribution(power=1.5) - - def predict(self, X, exposure): - """Predict the total claim amount. - - The predict method is not compatible with the scikit-learn API. 
- """ - return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) - - def score(self, X, y, sample_weight=None): - """Compute D², the percentage of deviance explained.""" - # TODO: use d2_score directly once it is available - mu = self.predict(X, exposure=sample_weight) - dev = mean_tweedie_deviance( - y, mu, sample_weight=sample_weight, power=1.5) - y_mean = np.average(y, weights=sample_weight) * np.ones_like(y) - dev_null = mean_tweedie_deviance( - y, y_mean, sample_weight=sample_weight, power=1.5) - return 1. - dev / dev_null - - -est_prod = ClaimProdEstimator(glm_freq, glm_sev) - scores = score_estimator( - est_prod, + (glm_freq, glm_sev), X_train, X_test, df_train, @@ -479,7 +445,8 @@ def score(self, X, y, sample_weight=None): ############################################################################## # -# or as a unique Compound Poisson model, also corresponding to a Tweedie model +# Indeed, an alternative approach for modeling the total loss is with a unique +# Compound Poisson model, also corresponding to a Tweedie model # with a power :math:`p \in (1, 2)`. We determine the optimal hyperparameter # ``p`` with a grid search, @@ -535,7 +502,7 @@ def score(self, X, y, sample_weight=None): "subset": subset_label, "observed": df.ClaimAmount.values.sum(), "predicted, frequency*severity model": np.sum( - est_prod.predict(X, exposure=df.Exposure.values) + df.Exposure.values*glm_freq.predict(X)*glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" % glm_total.best_estimator_.family.power: np.sum( @@ -545,5 +512,3 @@ def score(self, X, y, sample_weight=None): ) print(pd.DataFrame(res).set_index("subset").T) - -plt.show() From b1f5bde2ea12f7195d5efcf30d3a28fa9db1ef7f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 18 Sep 2019 16:57:48 +0200 Subject: [PATCH 170/209] Add "lift curve" for model validation in Poisson example --- ...plot_poisson_regression_non_normal_loss.py | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 769f321ff1562..3cdbc7cc1a789 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -39,6 +39,7 @@ from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import RandomForestRegressor +from sklearn.utils import gen_batches from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -295,3 +296,93 @@ def score_estimator(estimator, df_test): # Thus, among the considered estimators, ``PoissonRegressor`` is better suited # for modeling the long tail distribution of the data as compared to the # ``Ridge`` and ``RandomForestRegressor`` estimators. +# +# To ensure that estimators yield reasonable predictions for different +# policyholder types, we can bin test samples according to `y_pred` returned by +# each model. Then for each bin, compare the mean predicted `y_pred`, with +# the mean observed target. + + +def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): + """Compare predictions and observations for bins + ordered by y_pred + + We order the samples by ``y_pred`` and split it in bins. + In each bin the observed mean is compared with the predicted + mean. 
+
+    Parameters
+    ----------
+    y_true: array-like of shape (n_samples,)
+        Ground truth (correct) target values.
+    y_pred: array-like of shape (n_samples,)
+        Estimated target values.
+    sample_weight : array-like of shape (n_samples,)
+        Sample weights.
+    n_bins: int
+        number of bins to use
+
+    Returns
+    -------
+    bin_centers: ndarray of shape (n_bins,)
+        bin centers
+    y_true_bin: ndarray of shape (n_bins,)
+        average y_true for each bin
+    y_pred_bin: ndarray of shape (n_bins,)
+        average y_pred for each bin
+    """
+    idx_sort = np.argsort(y_pred)
+
+    bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins
+
+    y_pred_bin = np.zeros(n_bins)
+    y_true_bin = np.zeros(n_bins)
+    bin_size = len(y_true) // n_bins
+    for n, sl in enumerate(gen_batches(len(y_true), bin_size)):
+        weights = sample_weights[idx_sort][sl]
+        y_pred_bin[n] = np.average(
+            y_pred[idx_sort][sl], weights=weights
+        )
+        y_true_bin[n] = np.average(
+            y_true[idx_sort][sl],
+            weights=weights
+        )
+    return bin_centers, y_true_bin, y_pred_bin
+
+
+fig, ax = plt.subplots(1, 3, figsize=(12, 3.2))
+plt.subplots_adjust(wspace=0.3)
+
+
+for axi, (label, model, color) in zip(ax, [
+    ('Ridge', linregr, 'b'),
+    ('PoissonRegressor', glm_freq, 'k'),
+    ('Random Forest', gbr, 'r')
+]):
+    y_pred = model.predict(df_test)
+
+    q, y_true_seg, y_pred_seg = _lift_curve(
+        df_test.Frequency.values,
+        y_pred,
+        sample_weights=df_test.Exposure.values,
+        n_bins=10)
+
+    axi.plot(q, y_pred_seg, 'o'+color, label="predictions", ms=5)
+    axi.step(q, y_true_seg, '--'+color, label="observations",
+             where='mid')
+    axi.set_xlim(0, 1.0)
+    axi.set(
+        title=label,
+        xlabel='Fraction of samples sorted by y_pred',
+        ylabel='Mean Frequency (y_pred)'
+
+    )
+
+    axi.legend()
+
+
+##############################################################################
+#
+# On the above figure, ``PoissonRegressor`` is the model which presents the
+# best consistency between predicted and observed targets, both for low
+# and high target values.
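
The binning logic introduced by ``_lift_curve`` above can be exercised on synthetic data to see
what a well-calibrated model looks like. The following is a minimal, self-contained sketch (purely
illustrative, not part of the patch series): order the samples by ``y_pred``, split them into bins
of roughly equal size, and compare the exposure-weighted mean prediction with the exposure-weighted
mean observation in each bin.

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_bins = 1000, 5

# hypothetical predicted rates, exposures and Poisson-distributed observations
y_pred = rng.gamma(shape=1.0, scale=0.1, size=n_samples)
exposure = rng.uniform(0.5, 1.0, size=n_samples)
y_true = rng.poisson(y_pred * exposure) / exposure

# bins ordered by y_pred; for a calibrated model both columns roughly agree
for sl in np.array_split(np.argsort(y_pred), n_bins):
    print("mean y_pred: {:.3f}  mean y_true: {:.3f}".format(
        np.average(y_pred[sl], weights=exposure[sl]),
        np.average(y_true[sl], weights=exposure[sl])))
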
From a9ab4e4975d3b8b1bf27e73561bbadc921aac5bf Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 25 Sep 2019 19:24:11 +0200 Subject: [PATCH 171/209] Various improvements to the model comparison example --- ...plot_poisson_regression_non_normal_loss.py | 153 +++++++++--------- 1 file changed, 79 insertions(+), 74 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 3cdbc7cc1a789..5c044d5530bd8 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -39,7 +39,7 @@ from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import RandomForestRegressor -from sklearn.utils import gen_batches +from sklearn.utils import gen_even_slices from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -149,8 +149,8 @@ def load_mtpl2(n_samples=100000): column_trans, DummyRegressor(strategy='mean') ) -dummy.fit(df_train, df_train.Frequency, - dummyregressor__sample_weight=df_train.Exposure) +dummy.fit(df_train, df_train["Frequency"], + dummyregressor__sample_weight=df_train["Exposure"]) def score_estimator(estimator, df_test): @@ -159,11 +159,11 @@ def score_estimator(estimator, df_test): y_pred = estimator.predict(df_test) print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, y_pred, - df_test.Exposure.values)) + df_test["Frequency"], y_pred, + df_test["Exposure"])) print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, y_pred, - df_test.Exposure.values)) + df_test["Frequency"], y_pred, + df_test["Exposure"])) # ignore negative predictions, as they are invalid for # the Poisson deviance @@ -174,12 +174,12 @@ def score_estimator(estimator, df_test): "poisson deviance".format((~mask).sum(), mask.shape[0])) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values[mask], + df_test["Frequency"][mask], y_pred[mask], - df_test.Exposure.values[mask])) + df_test["Exposure"][mask])) -print("DummyRegressor") +print("Constant mean frequency evaluation:") score_estimator(dummy, df_test) ############################################################################## @@ -187,12 +187,12 @@ def score_estimator(estimator, df_test): # We start by modeling the target variable with the least squares linear # regression model, -linregr = make_pipeline( +ridge = make_pipeline( column_trans, Ridge(alpha=1.0) ) -linregr.fit(df_train, df_train.Frequency, - ridge__sample_weight=df_train.Exposure) +ridge.fit(df_train, df_train["Frequency"], + ridge__sample_weight=df_train["Exposure"]) ############################################################################## # @@ -203,32 +203,33 @@ def score_estimator(estimator, df_test): # apporach could be to use class:`compose.TransformedTargetRegressor` # meta-estimator to map ``y_pred`` to strictly positive domain. 
-print("Ridge") -score_estimator(linregr, df_test) +print("Ridge evaluation:") +score_estimator(ridge, df_test) ############################################################################## # # Next we fit the Poisson regressor on the target variable, -glm_freq = make_pipeline( +poisson = make_pipeline( column_trans, PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) ) -glm_freq.fit(df_train, df_train.Frequency, - poissonregressor__sample_weight=df_train.Exposure) +poisson.fit(df_train, df_train["Frequency"], + poissonregressor__sample_weight=df_train["Exposure"]) -print("PoissonRegressor") -score_estimator(glm_freq, df_test) +print("PoissonRegressor evaluation:") +score_estimator(poisson, df_test) ############################################################################## # -# Finally, we will consider a non linear model with a random forest that -# still minimizes the least square error. Random forest does -# not require for categorical data to be one hot encoded, therefore here we use -# a simpler pre-processing pipeline with :class:`preprocessing.OrdinalEncoder`, - - -gbr = make_pipeline( +# Finally, we will consider a non-linear model, namely a random forest. Random +# forests do not require the categorical data to be one-hot encoded, instead +# we encode each category label with an arbirtrary integer using +# :class:`preprocessing.OrdinalEncoder` to make the model faster to train (the +# same information is encoded with a small number of features than with +# one-hot encoding). + +rf = make_pipeline( ColumnTransformer( [ ( @@ -242,53 +243,55 @@ def score_estimator(estimator, df_test): ], remainder="drop", ), - RandomForestRegressor(min_weight_fraction_leaf=1e-2) + RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) ) -gbr.fit(df_train, df_train.Frequency.values, - randomforestregressor__sample_weight=df_train.Exposure.values) +rf.fit(df_train, df_train["Frequency"].values, + randomforestregressor__sample_weight=df_train["Exposure"].values) + +print("RandomForestRegressor evaluation:") +score_estimator(rf, df_test) -print("RandomForestRegressor") -score_estimator(gbr, df_test) ############################################################################## # -# In this example, although random forest minimizes the least square error, -# because of a higher predictive power it also results in a smaller Poisson -# deviance than the Poisson regression model. +# The random forest model also minimizes the conditional least square error. +# However because of a higher predictive power it also results in a smaller +# Poisson deviance than the Poisson regression model. # -# Evaluating models with a single train / test split is prone to numerical -# errors, we can verify that we would also get equivalent resuts with the -# cross-validation score. +# Not that Evaluating models with a single train / test split is prone to +# random fluctuations. We can verify that we would also get equivalent +# conclusions with cross-validated performance metrics. 
# -# The difference between these models can also be visualized by comparing the +# The qualitative difference between these models can also be visualized by comparing the # histogram of observed target values with that of predicted values, fig, axes = plt.subplots(1, 4, figsize=(16, 3)) fig.subplots_adjust(bottom=0.2) +n_bins = 20 +df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) -df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=axes[0]) +axes[0].set_title("Data") +axes[0].set_xlabel("y (observed Frequency)") -axes[0].set_title('Experimental data') - -for idx, model in enumerate([linregr, glm_freq, gbr]): +for idx, model in enumerate([ridge, poisson, rf]): y_pred = model.predict(df_train) - pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) + pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), ax=axes[idx+1]) axes[idx + 1].set_title(model[-1].__class__.__name__) for axi in axes: axi.set( yscale='log', - xlabel="y (Frequency)" + xlabel="y_pred (predicted expected Frequency)" ) ############################################################################## # # The experimental data presents a long tail distribution for ``y``. In all -# models we predict the mean expected value, so we will have necessairily fewer -# extreme values. Additionally normal distribution used in ``Ridge`` and +# models we predict the mean expected value, so we will have necessairily +# fewer extreme values. Additionally normal distribution used in ``Ridge`` and # ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. @@ -298,14 +301,13 @@ def score_estimator(estimator, df_test): # ``Ridge`` and ``RandomForestRegressor`` estimators. # # To ensure that estimators yield reasonable predictions for different -# policyholder types, we can bin test samples according to `y_pred` returned by -# each model. Then for each bin, compare the mean predicted `y_pred`, with -# the mean observed target. +# policyholder types, we can bin test samples according to `y_pred` returned +# by each model. Then for each bin, compare the mean predicted `y_pred`, with +# the mean observed target: -def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): - """Compare predictions and observations for bins - ordered by y_pred +def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): + """Compare predictions and observations for bins ordered by y_pred We order the samples by ``y_pred`` and split it in bins. 
In each bin the observed mean is compared with the predicted @@ -332,14 +334,12 @@ def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): average y_pred for each bin """ idx_sort = np.argsort(y_pred) - bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins - y_pred_bin = np.zeros(n_bins) y_true_bin = np.zeros(n_bins) - bin_size = len(y_true) // n_bins - for n, sl in enumerate(gen_batches(len(y_true), bin_size)): - weights = sample_weights[idx_sort][sl] + + for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)): + weights = sample_weight[idx_sort][sl] y_pred_bin[n] = np.average( y_pred[idx_sort][sl], weights=weights ) @@ -350,39 +350,44 @@ def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): return bin_centers, y_true_bin, y_pred_bin -fig, ax = plt.subplots(1, 3, figsize=(12, 3.2)) +fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.2)) plt.subplots_adjust(wspace=0.3) - -for axi, (label, model, color) in zip(ax, [ - ('Ridge', linregr, 'b'), - ('PoissonRegressor', glm_freq, 'k'), - ('Random Forest', gbr, 'r') +for axi, (label, model) in zip(ax, [ + ('Ridge', ridge), + ('PoissonRegressor', poisson), + ('Random Forest', rf) ]): y_pred = model.predict(df_test) - q, y_true_seg, y_pred_seg = _lift_curve( - df_test.Frequency.values, + q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( + df_test["Frequency"].values, y_pred, - sample_weights=df_test.Exposure.values, - n_bins=10) + sample_weights=df_test["Exposure"].values, + n_bins=5) - axi.plot(q, y_pred_seg, 'o'+color, label="predictions", ms=5) - axi.step(q, y_true_seg, '--'+color, label="observations", - where='mid') + axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") + axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") axi.set_xlim(0, 1.0) + axi.set_ylim(0, 0.3) axi.set( title=label, xlabel='Fraction of samples sorted by y_pred', ylabel='Mean Frequency (y_pred)' ) - axi.legend() ############################################################################## # # On the above figure, ``PoissonRegressor`` is the model which presents the -# best consistency between predicted and observed targets, both for low -# and high target values. +# best consistency between predicted and observed targets, both for low and +# high target values. +# +# The ridge regression model tends to predict very low expected frequencies +# that do not match the data. +# +# The random forest regression model also tends to exaggerate low predicted +# frequencies although to a lower extent than ridge. It also tends to +# exaggerate high frequencies on the other hand. From be7bb67e2859b6c74854aeb942a008c3947757e2 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 25 Sep 2019 19:31:51 +0200 Subject: [PATCH 172/209] Add cumulated claims plot --- ...plot_poisson_regression_non_normal_loss.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 5c044d5530bd8..cbfbfc45dd703 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -391,3 +391,48 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100 # The random forest regression model also tends to exaggerate low predicted # frequencies although to a lower extent than ridge. It also tends to # exaggerate high frequencies on the other hand. 
+ + + +def _cumulated_claims(y_true, y_pred, exposure): + idx_sort = np.argsort(y_pred)[::-1] + sorted_exposure = exposure[idx_sort] + sorted_frequencies = y_true[idx_sort] + cumulated_exposure = np.cumsum(sorted_exposure) + cumulated_exposure /= cumulated_exposure[-1] + cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) + cumulated_claims /= cumulated_claims[-1] + return cumulated_exposure, cumulated_claims + + +fig, ax = plt.subplots(figsize=(8, 8)) +plt.subplots_adjust(wspace=0.3) + +for (label, model) in [ + ('Ridge', ridge), + ('PoissonRegressor', poisson), + ('Random Forest', rf) +]: + y_pred = model.predict(df_test) + cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + y_pred, + df_test["Exposure"].values) + ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) + +# Oracle model +cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + df_test["Frequency"].values, + df_test["Exposure"].values) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label="Oracle") + +# Random Baseline +ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") +ax.set( + title="Cumulated claims by model", + xlabel='Fraction of cumulated exposure (from riskiest to safest)', + ylabel='Fraction of cumulated number of claims' + +) +ax.legend() From 4125c20c9a5d06a0d3ed8241cceb6aaf582b350f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 09:05:17 +0200 Subject: [PATCH 173/209] Improve the cumulated nb claims plot --- ...plot_poisson_regression_non_normal_loss.py | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index cbfbfc45dd703..22a4b419cf483 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -263,8 +263,9 @@ def score_estimator(estimator, df_test): # random fluctuations. We can verify that we would also get equivalent # conclusions with cross-validated performance metrics. # -# The qualitative difference between these models can also be visualized by comparing the -# histogram of observed target values with that of predicted values, +# The qualitative difference between these models can also be visualized by +# comparing the histogram of observed target values with that of predicted +# values, fig, axes = plt.subplots(1, 4, figsize=(16, 3)) @@ -306,7 +307,8 @@ def score_estimator(estimator, df_test): # the mean observed target: -def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): +def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, + n_bins=100): """Compare predictions and observations for bins ordered by y_pred We order the samples by ``y_pred`` and split it in bins. @@ -363,7 +365,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100 q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( df_test["Frequency"].values, y_pred, - sample_weights=df_test["Exposure"].values, + sample_weight=df_test["Exposure"].values, n_bins=5) axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") @@ -391,11 +393,21 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100 # The random forest regression model also tends to exaggerate low predicted # frequencies although to a lower extent than ridge. 
It also tends to # exaggerate high frequencies on the other hand. - +# +# However for some business applications we are not necessarily interested in +# the the ability of the model in predicting the expected frequency value but +# instead in predicting which customer profiles are the riskiest and which are +# the safest. In this case the model evaluation would cast the problem as a +# ranking problem rather than a regression problem. +# +# To compare the 3 models under this light on, one can plot the fraction +# of cumulated number of claims vs the fraction of cumulated of exposure +# for test samples ordered by the model predictions, from riskiest to safest +# according to each model: def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred)[::-1] + idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest sorted_exposure = exposure[idx_sort] sorted_frequencies = y_true[idx_sort] cumulated_exposure = np.cumsum(sorted_exposure) @@ -420,7 +432,7 @@ def _cumulated_claims(y_true, y_pred, exposure): df_test["Exposure"].values) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) -# Oracle model +# Oracle model: y_pred == y_test cum_exposure, cum_claims = _cumulated_claims( df_test["Frequency"].values, df_test["Frequency"].values, @@ -433,6 +445,22 @@ def _cumulated_claims(y_true, y_pred, exposure): title="Cumulated claims by model", xlabel='Fraction of cumulated exposure (from riskiest to safest)', ylabel='Fraction of cumulated number of claims' - ) ax.legend() + +############################################################################## +# +# This plot reveals that the random forest model is almost uniformly the best +# at sorting customers by risk profiles even if the absolute value of the +# predicted expected frequencies are less well calibrated than for the linear +# Poisson model. +# +# +# All three models are significantly better than chance but also very far from +# making perfect predictions. +# +# This last point is expected due to the nature of the problem: the occurence +# of accidents is mostly dominated by environmental causes that are not +# captured in the columns of the dataset. 
+ +plt.show() From 0070d527e7b3399cae9a6bd6447b4de636191481 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 10:41:02 +0200 Subject: [PATCH 174/209] Fix wrong xlabel in histogram plot --- .../plot_poisson_regression_non_normal_loss.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 22a4b419cf483..47e08b618758c 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -274,16 +274,15 @@ def score_estimator(estimator, df_test): df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) axes[0].set_title("Data") +axes[0].set_yscale('log') axes[0].set_xlabel("y (observed Frequency)") for idx, model in enumerate([ridge, poisson, rf]): y_pred = model.predict(df_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), ax=axes[idx+1]) - axes[idx + 1].set_title(model[-1].__class__.__name__) - -for axi in axes: - axi.set( + axes[idx + 1].set( + title=model[-1].__class__.__name__, yscale='log', xlabel="y_pred (predicted expected Frequency)" ) From 9d6bb5258ac604bc523beb1c8b85357344a9929e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 12:03:51 +0200 Subject: [PATCH 175/209] More example improvements (preprocessors + plots) --- ...plot_poisson_regression_non_normal_loss.py | 130 +++++++++--------- 1 file changed, 63 insertions(+), 67 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 47e08b618758c..76f957c57b6da 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -40,6 +40,7 @@ from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import RandomForestRegressor from sklearn.utils import gen_even_slices +from sklearn.metrics import auc from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -95,26 +96,33 @@ def load_mtpl2(n_samples=100000): df["ClaimNb"] = df["ClaimNb"].clip(upper=4) df["Exposure"] = df["Exposure"].clip(upper=1) -column_trans = ColumnTransformer( +############################################################################## +# +# The remaining columns can be used to predict the frequency of claim events. +# Those columns are very heterogeneous with a mix of categorical and numeric +# variables with different scales, possibly with heavy tails. 
+# +# In order to fit linear models with those predictors it is therefore +# necessary to perform standard feature transformation as follows: + +log_scale_transformer = make_pipeline( + FunctionTransformer(np.log, validate=False), + StandardScaler() +) + +linear_model_preprocessor = ColumnTransformer( [ - ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), - ( - "Veh_Brand_Gas_Region", - OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"], - ), - ("BonusMalus", "passthrough", ["BonusMalus"]), - ( - "Density_log", - make_pipeline( - FunctionTransformer(np.log, validate=False), StandardScaler() - ), - ["Density"], - ), + ("passthrough_numeric", "passthrough", + ["BonusMalus"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), + ["VehAge", "DrivAge"]), + ("log_scaled_numeric", log_scale_transformer, + ["Density"]), + ("onehot_categorical", OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), ], remainder="drop", ) -X = column_trans.fit_transform(df) ############################################################################## # @@ -141,12 +149,13 @@ def load_mtpl2(n_samples=100000): # significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a -# baseline an estimator that returns the mean of the training sample. +# baseline an estimator that constantly predicts the mean frequency of the +# training sample. df_train, df_test = train_test_split(df, random_state=0) dummy = make_pipeline( - column_trans, + linear_model_preprocessor, DummyRegressor(strategy='mean') ) dummy.fit(df_train, df_train["Frequency"], @@ -187,10 +196,7 @@ def score_estimator(estimator, df_test): # We start by modeling the target variable with the least squares linear # regression model, -ridge = make_pipeline( - column_trans, - Ridge(alpha=1.0) -) +ridge = make_pipeline(linear_model_preprocessor, Ridge(alpha=1.0)) ridge.fit(df_train, df_train["Frequency"], ridge__sample_weight=df_train["Exposure"]) @@ -211,7 +217,7 @@ def score_estimator(estimator, df_test): # Next we fit the Poisson regressor on the target variable, poisson = make_pipeline( - column_trans, + linear_model_preprocessor, PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) ) poisson.fit(df_train, df_train["Frequency"], @@ -229,20 +235,17 @@ def score_estimator(estimator, df_test): # same information is encoded with a small number of features than with # one-hot encoding). 
+rf_preprocessor = ColumnTransformer( + [ + ("categorical", OrdinalEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ("numeric", "passthrough", + ["VehAge", "DrivAge", "BonusMalus", "Density"]), + ], + remainder="drop", +) rf = make_pipeline( - ColumnTransformer( - [ - ( - "Veh_Brand_Gas_Region", OrdinalEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"], - ), - ( - "Continious", "passthrough", - ["VehAge", "DrivAge", "BonusMalus", "Density"] - ), - ], - remainder="drop", - ), + rf_preprocessor, RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) ) rf.fit(df_train, df_train["Frequency"].values, @@ -351,14 +354,10 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, return bin_centers, y_true_bin, y_pred_bin -fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.2)) +fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) plt.subplots_adjust(wspace=0.3) -for axi, (label, model) in zip(ax, [ - ('Ridge', ridge), - ('PoissonRegressor', poisson), - ('Random Forest', rf) -]): +for axi, model in zip(ax, [ridge, poisson, rf]): y_pred = model.predict(df_test) q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( @@ -372,19 +371,19 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, axi.set_xlim(0, 1.0) axi.set_ylim(0, 0.3) axi.set( - title=label, + title=model[-1].__class__.__name__, xlabel='Fraction of samples sorted by y_pred', ylabel='Mean Frequency (y_pred)' ) axi.legend() - +plt.tight_layout() ############################################################################## # # On the above figure, ``PoissonRegressor`` is the model which presents the # best consistency between predicted and observed targets, both for low and -# high target values. +# high predicted target values. # # The ridge regression model tends to predict very low expected frequencies # that do not match the data. @@ -393,16 +392,16 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # frequencies although to a lower extent than ridge. It also tends to # exaggerate high frequencies on the other hand. # -# However for some business applications we are not necessarily interested in -# the the ability of the model in predicting the expected frequency value but -# instead in predicting which customer profiles are the riskiest and which are -# the safest. In this case the model evaluation would cast the problem as a -# ranking problem rather than a regression problem. +# However, for some business applications, we are not necessarily interested +# in the the ability of the model in predicting the expected frequency value +# but instead in predicting which policyholder groups are the riskiest and +# which are the safest. In this case the model evaluation would cast the +# problem as a ranking problem rather than a regression problem. 
# -# To compare the 3 models under this light on, one can plot the fraction -# of cumulated number of claims vs the fraction of cumulated of exposure -# for test samples ordered by the model predictions, from riskiest to safest -# according to each model: +# To compare the 3 models under this light on, one can plot the fraction of +# cumulated number of claims vs the fraction of cumulated of exposure for test +# samples ordered by the model predictions, from riskiest to safest according +# to each model: def _cumulated_claims(y_true, y_pred, exposure): @@ -417,18 +416,16 @@ def _cumulated_claims(y_true, y_pred, exposure): fig, ax = plt.subplots(figsize=(8, 8)) -plt.subplots_adjust(wspace=0.3) -for (label, model) in [ - ('Ridge', ridge), - ('PoissonRegressor', poisson), - ('Random Forest', rf) -]: +for model in [ridge, poisson, rf]: y_pred = model.predict(df_test) cum_exposure, cum_claims = _cumulated_claims( df_test["Frequency"].values, y_pred, df_test["Exposure"].values) + area = auc(cum_exposure, cum_claims) + label = "{} (area under curve: {:.3f})".format( + model[-1].__class__.__name__, area) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test @@ -449,17 +446,16 @@ def _cumulated_claims(y_true, y_pred, exposure): ############################################################################## # -# This plot reveals that the random forest model is almost uniformly the best -# at sorting customers by risk profiles even if the absolute value of the -# predicted expected frequencies are less well calibrated than for the linear -# Poisson model. -# +# This plot reveals that the random forest model is slightly better at ranking +# policyholders by risk profiles even if the absolute value of the predicted +# expected frequencies are less well calibrated than for the linear Poisson +# model. # # All three models are significantly better than chance but also very far from # making perfect predictions. # -# This last point is expected due to the nature of the problem: the occurence -# of accidents is mostly dominated by environmental causes that are not +# This last point is expected due to the nature of the problem: the occurrence +# of accidents is mostly dominated by circumstantial causes that are not # captured in the columns of the dataset. plt.show() From b353b2dcdb25bd2e55dffaacda1b80856f4b5b78 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 13:14:09 +0200 Subject: [PATCH 176/209] Simplify dataset + use more data --- ...plot_poisson_regression_non_normal_loss.py | 99 ++++++++----------- 1 file changed, 39 insertions(+), 60 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 76f957c57b6da..da2a4bd2dccf8 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -9,10 +9,6 @@ number of insurance claims (or frequency) following car accidents for a policyholder given historical data over a population of policyholders. -We start by defining a few helper functions for loading the data and -visualizing results. - - .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor Third-Party Liability Claims (November 8, 2018). 
`doi:10.2139/ssrn.3164764 `_ @@ -46,7 +42,7 @@ from sklearn.metrics import mean_poisson_deviance -def load_mtpl2(n_samples=100000): +def load_mtpl2(n_samples=None): """Fetcher for French Motor Third-Party Liability Claims dataset Parameters @@ -57,43 +53,27 @@ def load_mtpl2(n_samples=100000): """ # freMTPL2freq dataset from https://www.openml.org/d/41214 - df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] - df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) - df_freq.set_index('IDpol', inplace=True) - - # freMTPL2sev dataset from https://www.openml.org/d/41215 - df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] - - # sum ClaimAmount over identical IDs - df_sev = df_sev.groupby('IDpol').sum() - - df = df_freq.join(df_sev, how="left") - df["ClaimAmount"].fillna(0, inplace=True) + df = fetch_openml(data_id=41214, as_frame=True)['data'] # unquote string fields for column_name in df.columns[df.dtypes.values == np.object]: df[column_name] = df[column_name].str.strip("'") - return df.iloc[:n_samples] + if n_samples is not None: + return df.iloc[:n_samples] + return df ############################################################################## # -# 1. Loading datasets and pre-processing -# -------------------------------------- +# Let's load the motor claim dataset. We ignore the severity data for this +# study for the sake of simplicitly. # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, -# containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same policy ids -# (``IDpol``). - -df = load_mtpl2(n_samples=50000) +# We also subsample the data for the sake of computational cost and running +# time. Using the full dataset would lead to similar conclusions. -# Note: filter out claims with zero amount, as the severity model -# requires strictly positive target values. -df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 +df = load_mtpl2(n_samples=300000) -# correct for unreasonable observations (that might be data error) -df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +# Correct for unreasonable observations (that might be data error) df["Exposure"] = df["Exposure"].clip(upper=1) ############################################################################## @@ -133,14 +113,14 @@ def load_mtpl2(n_samples=100000): # ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, # and use ``Exposure`` as `sample_weight`. -df["Frequency"] = df.ClaimNb / df.Exposure +df["Frequency"] = df["ClaimNb"] / df["Exposure"] print( - pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() + pd.cut(df["Frequency"], [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() ) print("Average Frequency = {}" - .format(np.average(df.Frequency, weights=df.Exposure))) + .format(np.average(df["Frequency"], weights=df["Exposure"]))) ############################################################################## # @@ -262,13 +242,13 @@ def score_estimator(estimator, df_test): # However because of a higher predictive power it also results in a smaller # Poisson deviance than the Poisson regression model. # -# Not that Evaluating models with a single train / test split is prone to -# random fluctuations. We can verify that we would also get equivalent -# conclusions with cross-validated performance metrics. +# Evaluating models with a single train / test split is prone to random +# fluctuations. 
If computation resources allow, it should be verified that +# cross-validated performance metrics would lead to similar conclusions. # # The qualitative difference between these models can also be visualized by # comparing the histogram of observed target values with that of predicted -# values, +# values: fig, axes = plt.subplots(1, 4, figsize=(16, 3)) @@ -293,8 +273,8 @@ def score_estimator(estimator, df_test): ############################################################################## # # The experimental data presents a long tail distribution for ``y``. In all -# models we predict the mean expected value, so we will have necessairily -# fewer extreme values. Additionally normal distribution used in ``Ridge`` and +# models we predict the mean expected value, so we will have necessarily fewer +# extreme values. Additionally normal distribution used in ``Ridge`` and # ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. @@ -364,12 +344,12 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, df_test["Frequency"].values, y_pred, sample_weight=df_test["Exposure"].values, - n_bins=5) + n_bins=10) axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") axi.set_xlim(0, 1.0) - axi.set_ylim(0, 0.3) + axi.set_ylim(0, 0.6) axi.set( title=model[-1].__class__.__name__, xlabel='Fraction of samples sorted by y_pred', @@ -381,16 +361,13 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, ############################################################################## # -# On the above figure, ``PoissonRegressor`` is the model which presents the -# best consistency between predicted and observed targets, both for low and -# high predicted target values. -# -# The ridge regression model tends to predict very low expected frequencies -# that do not match the data. +# The ``Ridge`` regression model can predict very low expected frequencies +# that do not match the data. It can therefore severly under-estimate the risk +# for some policyholders. # -# The random forest regression model also tends to exaggerate low predicted -# frequencies although to a lower extent than ridge. It also tends to -# exaggerate high frequencies on the other hand. +# ``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency +# between predicted and observed targets, especially for low predicted target +# values. # # However, for some business applications, we are not necessarily interested # in the the ability of the model in predicting the expected frequency value @@ -399,9 +376,8 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # problem as a ranking problem rather than a regression problem. 
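# -- Editor's aside, not part of this patch: before turning to the cumulated
# claims plot introduced below, a quick rank-based check is a sketch like the
# following, assuming the fitted ``ridge``, ``poisson`` and ``rf`` pipelines
# and ``df_test`` from above (Spearman's correlation ignores the ``Exposure``
# weights, unlike the plot that follows):
from scipy.stats import spearmanr

for model in [ridge, poisson, rf]:
    rho, _ = spearmanr(df_test["Frequency"], model.predict(df_test))
    print("%s: Spearman rho = %.3f" % (model[-1].__class__.__name__, rho))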
# # To compare the 3 models under this light on, one can plot the fraction of -# cumulated number of claims vs the fraction of cumulated of exposure for test -# samples ordered by the model predictions, from riskiest to safest according -# to each model: +# the number of claims vs the fraction of exposure for test samples ordered by +# the model predictions, from riskiest to safest according to each model: def _cumulated_claims(y_true, y_pred, exposure): @@ -433,16 +409,19 @@ def _cumulated_claims(y_true, y_pred, exposure): df_test["Frequency"].values, df_test["Frequency"].values, df_test["Exposure"].values) -ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label="Oracle") +area = auc(cum_exposure, cum_claims) +label = "Oracle (area under curve: {:.3f})".format(area) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) # Random Baseline -ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") +ax.plot([0, 1], [0, 1], linestyle="--", color="black", + label="Random baseline") ax.set( - title="Cumulated claims by model", - xlabel='Fraction of cumulated exposure (from riskiest to safest)', - ylabel='Fraction of cumulated number of claims' + title="Cumulated number of claims by model", + xlabel='Fraction of exposure (from riskiest to safest)', + ylabel='Fraction of number of claims' ) -ax.legend() +ax.legend(loc="lower right") ############################################################################## # From 88757fdb99cc516be230fe08ec1ebfb7bea0b694 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 15:59:36 +0200 Subject: [PATCH 177/209] Remove solver parameter from {Poisson,Gamma,Tweedie}Regression --- sklearn/linear_model/_glm/glm.py | 54 +++++++++++--------------------- sklearn/neighbors/base.py | 8 ++--- 2 files changed, 22 insertions(+), 40 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index d6da8b8b80949..f7985c0f3bae3 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -80,7 +80,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Calls scipy's L-BFGS-B optimizer. max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -113,7 +113,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', @@ -423,14 +423,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - solver : {'lbfgs'}, optional (default='lbfgs') - Algorithm to use in the optimization problem: - - 'lbfgs' - Calls scipy's L-BFGS-B optimizer. - max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -458,16 +452,16 @@ class PoissonRegressor(GeneralizedLinearRegressor): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. 
""" def __init__(self, alpha=1.0, fit_intercept=True, - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="poisson", link='log', - solver=solver, max_iter=max_iter, tol=tol, - warm_start=warm_start, copy_X=copy_X, verbose=verbose) + family="poisson", link='log', max_iter=max_iter, + tol=tol, warm_start=warm_start, copy_X=copy_X, + verbose=verbose) @property def family(self): @@ -511,14 +505,8 @@ class GammaRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - solver : {'lbfgs'}, optional (default='lbfgs') - Algorithm to use in the optimization problem: - - 'lbfgs' - Calls scipy's L-BFGS-B optimizer. - max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -546,16 +534,16 @@ class GammaRegressor(GeneralizedLinearRegressor): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, solver='lbfgs', + def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="gamma", link='log', - solver=solver, max_iter=max_iter, tol=tol, - warm_start=warm_start, copy_X=copy_X, verbose=verbose) + family="gamma", link='log', max_iter=max_iter, + tol=tol, warm_start=warm_start, copy_X=copy_X, + verbose=verbose) @property def family(self): @@ -632,14 +620,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - solver : {'lbfgs'}, optional (default='lbfgs') - Algorithm to use in the optimization problem: - - 'lbfgs' - Calls scipy's L-BFGS-B optimizer. - max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -666,15 +648,15 @@ class TweedieRegressor(GeneralizedLinearRegressor): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. 
""" def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, - link='auto', solver='lbfgs', max_iter=100, tol=1e-4, + link='auto', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, - solver=solver, max_iter=max_iter, tol=tol, + max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 9548a619b0b14..d178d607d3636 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -289,13 +289,13 @@ def _pairwise(self): return self.metric == 'precomputed' -def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance): +def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance, **kwargs): """Helper for the Parallel calls in KNeighborsMixin.kneighbors The Cython method tree.query is not directly picklable by cloudpickle under PyPy. """ - return tree.query(data, n_neighbors, return_distance) + return tree.query(data, n_neighbors, return_distance, **kwargs) class KNeighborsMixin: @@ -336,7 +336,7 @@ def _kneighbors_reduce_func(self, dist, start, result = neigh_ind return result - def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, **kwargs): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. @@ -458,7 +458,7 @@ class from an array representing our data set and ask who's parallel_kwargs = {"prefer": "threads"} result = Parallel(n_jobs, **parallel_kwargs)( delayed_query( - self._tree, X[s], n_neighbors, return_distance) + self._tree, X[s], n_neighbors, return_distance, **kwargs) for s in gen_even_slices(X.shape[0], n_jobs) ) else: From 6d119d43577e46f1a0dde29980df20434a920739 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 16:02:15 +0200 Subject: [PATCH 178/209] Revert some accidental changes from 88757fdb99cc516be230fe08ec1ebfb7bea0b694. --- sklearn/neighbors/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index d178d607d3636..9548a619b0b14 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -289,13 +289,13 @@ def _pairwise(self): return self.metric == 'precomputed' -def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance, **kwargs): +def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance): """Helper for the Parallel calls in KNeighborsMixin.kneighbors The Cython method tree.query is not directly picklable by cloudpickle under PyPy. """ - return tree.query(data, n_neighbors, return_distance, **kwargs) + return tree.query(data, n_neighbors, return_distance) class KNeighborsMixin: @@ -336,7 +336,7 @@ def _kneighbors_reduce_func(self, dist, start, result = neigh_ind return result - def kneighbors(self, X=None, n_neighbors=None, return_distance=True, **kwargs): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. 
@@ -458,7 +458,7 @@ class from an array representing our data set and ask who's parallel_kwargs = {"prefer": "threads"} result = Parallel(n_jobs, **parallel_kwargs)( delayed_query( - self._tree, X[s], n_neighbors, return_distance, **kwargs) + self._tree, X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) ) else: From b735eb786ca9ef03a9bbfc25851e62a4fbb71f3b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 16:23:12 +0200 Subject: [PATCH 179/209] Additional comment about the use of properties with setters --- sklearn/linear_model/_glm/distribution.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index a5e42bcee5d1c..5f9e9ed06847c 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -245,6 +245,9 @@ def power(self): @power.setter def power(self, power): + # We use a property with a setter, to update lower and + # upper bound when the power parameter is updated e.g. in grid + # search. if not isinstance(power, numbers.Real): raise TypeError('power must be a real number, input was {0}' .format(power)) From 2d911143067a9ab0fb206cbbb0b13fa228955969 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 16:43:13 +0200 Subject: [PATCH 180/209] Add additional tests for link derivatives --- sklearn/linear_model/_glm/tests/test_link.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index 36219e09b58e3..27ec4ed19bdc2 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -4,6 +4,7 @@ import numpy as np from numpy.testing import assert_allclose import pytest +from scipy.optimize import check_grad from sklearn.linear_model._glm.link import ( IdentityLink, @@ -15,12 +16,12 @@ LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink] -@pytest.mark.parametrize('link', LINK_FUNCTIONS) -def test_link_properties(link): +@pytest.mark.parametrize('Link', LINK_FUNCTIONS) +def test_link_properties(Link): """Test link inverse and derivative.""" rng = np.random.RandomState(42) x = rng.rand(100) * 100 - link = link() # instantiate object + link = Link() if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 @@ -30,3 +31,15 @@ def test_link_properties(link): # g = link, h = link.inverse assert_allclose(link.derivative(link.inverse(x)), 1 / link.inverse_derivative(x)) + + +@pytest.mark.parametrize('Link', LINK_FUNCTIONS) +def test_link_derivative(Link): + link = Link() + x = np.random.RandomState(0).rand(1) + err = check_grad(link, link.derivative, x) / link.derivative(x) + assert abs(err) < 1e-6 + + err = (check_grad(link.inverse, link.inverse_derivative, x) + / link.derivative(x)) + assert abs(err) < 1e-6 From 89103bc417646864a1aa85c616a3148cb26ac2ed Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Sun, 29 Sep 2019 23:06:18 +0200 Subject: [PATCH 181/209] cosmits + typos --- doc/modules/linear_model.rst | 4 +-- ...plot_poisson_regression_non_normal_loss.py | 36 +++++++++---------- ...lot_tweedie_regression_insurance_claims.py | 6 ++-- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a9b2e66599537..e53f309076b3b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -996,8 +996,8 @@ 
of the unit variance function: weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, - in this case one might say that y has a 'scaled' Poisson distributions. + Considering :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distribution. The same holds for other distributions. The estimator can be used as follows:: diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index da2a4bd2dccf8..d99654cf04080 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -129,8 +129,8 @@ def load_mtpl2(n_samples=None): # significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a -# baseline an estimator that constantly predicts the mean frequency of the -# training sample. +# baseline a "dummy" estimator that constantly predicts the mean frequency of +# the training sample. df_train, df_test = train_test_split(df, random_state=0) @@ -143,16 +143,16 @@ def load_mtpl2(n_samples=None): def score_estimator(estimator, df_test): - """Score an estimatr on the test set""" + """Score an estimator on the test set""" y_pred = estimator.predict(df_test) - print("MSE: %.3f" % mean_squared_error( - df_test["Frequency"], y_pred, - df_test["Exposure"])) - print("MAE: %.3f" % mean_absolute_error( - df_test["Frequency"], y_pred, - df_test["Exposure"])) + print("MSE: %.3f" % + mean_squared_error(df_test["Frequency"], y_pred, + df_test["Exposure"])) + print("MAE: %.3f" % + mean_absolute_error(df_test["Frequency"], y_pred, + df_test["Exposure"])) # ignore negative predictions, as they are invalid for # the Poisson deviance @@ -160,12 +160,12 @@ def score_estimator(estimator, df_test): if (~mask).any(): warnings.warn("estimator yields negative predictions for {} samples " "out of {}. These will be ignored while computing the " - "poisson deviance".format((~mask).sum(), mask.shape[0])) + "Poisson deviance".format((~mask).sum(), mask.shape[0])) - print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test["Frequency"][mask], - y_pred[mask], - df_test["Exposure"][mask])) + print("mean Poisson deviance: %.3f" % + mean_poisson_deviance(df_test["Frequency"][mask], + y_pred[mask], + df_test["Exposure"][mask])) print("Constant mean frequency evaluation:") @@ -285,8 +285,8 @@ def score_estimator(estimator, df_test): # # To ensure that estimators yield reasonable predictions for different # policyholder types, we can bin test samples according to `y_pred` returned -# by each model. Then for each bin, compare the mean predicted `y_pred`, with -# the mean observed target: +# by each model. 
Then for each bin, we compare the mean predicted `y_pred`, +# with the mean observed target: def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, @@ -325,7 +325,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)): weights = sample_weight[idx_sort][sl] y_pred_bin[n] = np.average( - y_pred[idx_sort][sl], weights=weights + y_pred[idx_sort][sl], weights=weights ) y_true_bin[n] = np.average( y_true[idx_sort][sl], @@ -337,7 +337,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) plt.subplots_adjust(wspace=0.3) -for axi, model in zip(ax, [ridge, poisson, rf]): +for axi, model in zip(ax, [ridge, poisson, rf]): y_pred = model.predict(df_test) q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 5a312f656d9ce..1c50541fcd85c 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -98,7 +98,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, predicted : frame a dataframe, with the same index as df, with the predicted target fill_legend : bool, default=False - wgether to show fill_between legend + whether to show fill_between legend """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() @@ -219,8 +219,8 @@ def score_estimator( # Score the model consisting of the product of frequency and # severity models, denormalized by the exposure values. est_freq, est_sev = estimator - y_pred = (df.Exposure.values * est_freq.predict(X) - * est_sev.predict(X)) + y_pred = (df.Exposure.values * est_freq.predict(X) * + est_sev.predict(X)) power = 1.5 else: y_pred = estimator.predict(X) From 4f28a44bff0a7d84de13f66e0d2b5264897f6c95 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 30 Sep 2019 11:20:25 +0200 Subject: [PATCH 182/209] Address some of Alex's comments - Link -> BaseLink - Removed reference to none existing notes - Use X.dtype for dtype of y - remove check_input --- sklearn/linear_model/_glm/glm.py | 51 ++++++++------------- sklearn/linear_model/_glm/link.py | 8 ++-- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 3 files changed, 25 insertions(+), 36 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f7985c0f3bae3..819e36e13addf 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -21,7 +21,7 @@ EDM_DISTRIBUTIONS ) from .link import ( - Link, + BaseLink, IdentityLink, LogLink, ) @@ -47,10 +47,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix X must have full column rank (no collinearities). 
fit_intercept : boolean, optional (default=True) @@ -63,7 +61,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log'} or an instance of class Link, \ + link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \ optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (y_pred). Option 'auto' sets the link @@ -167,8 +165,8 @@ def fit(self, X, y, sample_weight=None): "; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of - # class Link - if isinstance(self.link, Link): + # class BaseLink + if isinstance(self.link, BaseLink): self._link_instance = self.link else: if self.link == 'auto': @@ -227,7 +225,7 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32], y_numeric=True, multi_output=False, copy=self.copy_X) - y = np.asarray(y, dtype=np.float64) + y = np.asarray(y, dtype=X.dtype) weights = _check_sample_weight(sample_weight, X) @@ -247,8 +245,7 @@ def fit(self, X, y, sample_weight=None): # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) - weights_sum = np.sum(weights) - weights = weights / weights_sum + weights = weights / weights.sum() if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: @@ -318,7 +315,7 @@ def _linear_predictor(self, X): """ check_is_fitted(self) X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', ensure_2d=True, + dtype=[np.float64, np.float32], ensure_2d=True, allow_nd=False) return X @ self.coef_ + self.intercept_ @@ -413,10 +410,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): ---------- alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix X must have full column rank (no collinearities). fit_intercept : boolean, optional (default=True) @@ -454,9 +449,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, - max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', max_iter=max_iter, @@ -495,10 +489,8 @@ class GammaRegressor(GeneralizedLinearRegressor): ---------- alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. 
In this case, the design matrix X must have full column rank (no collinearities). fit_intercept : boolean, optional (default=True) @@ -536,9 +528,8 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, - max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="gamma", link='log', max_iter=max_iter, @@ -601,10 +592,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix X must have full column rank (no collinearities). link : {'auto', 'identity', 'log'}, default='auto' @@ -652,7 +641,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): """ def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='auto', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index cfdc6f181a832..7c404798b2c18 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -11,7 +11,7 @@ from scipy.special import expit, logit -class Link(metaclass=ABCMeta): +class BaseLink(metaclass=ABCMeta): """Abstract base class for Link functions.""" @abstractmethod @@ -65,7 +65,7 @@ def inverse_derivative(self, lin_pred): pass # pragma: no cover -class IdentityLink(Link): +class IdentityLink(BaseLink): """The identity link function g(x)=x.""" def __call__(self, y_pred): @@ -81,7 +81,7 @@ def inverse_derivative(self, lin_pred): return np.ones_like(lin_pred) -class LogLink(Link): +class LogLink(BaseLink): """The log link function g(x)=log(x).""" def __call__(self, y_pred): @@ -97,7 +97,7 @@ def inverse_derivative(self, lin_pred): return np.exp(lin_pred) -class LogitLink(Link): +class LogitLink(BaseLink): """The logit link function g(x)=logit(x).""" def __call__(self, y_pred): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 898d3c4edf9c0..030cdf8a9b141 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -338,7 +338,7 @@ def test_tweedie_regression_family(regression_data): with pytest.raises(TypeError, match=msg): est.family = None - # TODO: the following should not be allowed + # XXX: following is currently allowed, but maybe it shouldn't be # est.family.power = 2 From d4dfd0b13c9bd3ce7fae0d7ed2b0ad40411f46cc Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 30 Sep 2019 12:10:01 +0200 Subject: [PATCH 183/209] Removing unnecessary comments / asarray call --- sklearn/linear_model/_glm/glm.py | 1 - sklearn/linear_model/_glm/tests/test_glm.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py 
b/sklearn/linear_model/_glm/glm.py index 819e36e13addf..65de7f9532717 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -225,7 +225,6 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32], y_numeric=True, multi_output=False, copy=self.copy_X) - y = np.asarray(y, dtype=X.dtype) weights = _check_sample_weight(sample_weight, X) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 030cdf8a9b141..ef49fabdee4b1 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -338,9 +338,6 @@ def test_tweedie_regression_family(regression_data): with pytest.raises(TypeError, match=msg): est.family = None - # XXX: following is currently allowed, but maybe it shouldn't be - # est.family.power = 2 - @pytest.mark.parametrize( 'estimator, value', From 64d6fbd0dd99d66789c3043558f80883a974e99e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 10:57:18 +0200 Subject: [PATCH 184/209] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index e53f309076b3b..8945235279d9d 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -923,7 +923,7 @@ likelihood as \left( \log p(y|\mu,\phi) - \log p(y|y,\phi)\right). -The following table lists some specific EDM distributions—all are Tweedie +The following table lists some specific EDM distributions—all are instances of Tweedie distributions—and some of their properties. ================= =============================== ====================================== ============================================ From 82ace9f399c83d7f023f9dafcdada88540c0fd25 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 11:14:34 +0200 Subject: [PATCH 185/209] Remove unused solver parameter in tests --- sklearn/linear_model/_glm/tests/test_glm.py | 30 +++++++-------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index ef49fabdee4b1..a5df69b50c967 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -26,8 +26,6 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.model_selection import train_test_split -GLM_SOLVERS = ['lbfgs'] - @pytest.fixture(scope="module") def regression_data(): @@ -176,14 +174,13 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize('solver', GLM_SOLVERS) -def test_glm_identity_regression(solver): +def test_glm_identity_regression(): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] 
X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', - fit_intercept=False, solver=solver) + fit_intercept=False) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=1e-6) @@ -193,15 +190,14 @@ def test_glm_identity_regression(solver): [NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]) -@pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-6)]) -def test_glm_log_regression(family, solver, tol): +def test_glm_log_regression(family): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, tol=tol) + tol=1e-6) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=5e-6) @@ -239,8 +235,7 @@ def test_warm_start(fit_intercept): @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) -@pytest.mark.parametrize('solver', GLM_SOLVERS) -def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept): """Compare with Ridge regression for Normal distributions.""" alpha = 1.0 test_size = 10 @@ -264,8 +259,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, - solver=solver, check_input=False, - max_iter=300) + check_input=False, max_iter=300) glm.fit(X_train, y_train) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) @@ -274,8 +268,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=5e-5) -@pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-7)]) -def test_poisson_glmnet(solver, tol): +def test_poisson_glmnet(): """Compare Poisson regression with L2 regularization and LogLink to glmnet """ # library("glmnet") @@ -294,19 +287,16 @@ def test_poisson_glmnet(solver, tol): glm = GeneralizedLinearRegressor(alpha=1, fit_intercept=True, family='poisson', link='log', tol=1e-7, - solver=solver, max_iter=300, - ) + max_iter=300) glm.fit(X, y) assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) -@pytest.mark.parametrize("solver", GLM_SOLVERS) -def test_convergence_warning(solver, regression_data): +def test_convergence_warning(regression_data): X, y = regression_data - est = GeneralizedLinearRegressor(solver=solver, - max_iter=1, tol=1e-20) + est = GeneralizedLinearRegressor(max_iter=1, tol=1e-20) with pytest.warns(ConvergenceWarning): est.fit(X, y) From 5288a0ff156c254df7e465971127685f5532fbf3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 12:13:35 +0200 Subject: [PATCH 186/209] Add test for sample_weight consistency --- sklearn/linear_model/_glm/tests/test_glm.py | 36 +++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a5df69b50c967..2a54b759011e7 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ 
b/sklearn/linear_model/_glm/tests/test_glm.py @@ -181,8 +181,40 @@ def test_glm_identity_regression(): y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False) - res = glm.fit(X, y) - assert_allclose(res.coef_, coef, rtol=1e-6) + glm.fit(X, y) + assert_allclose(glm.coef_, coef, rtol=1e-6) + + +def test_glm_sample_weight_consistentcy(): + """Test that the impact of sample_weight is consistent""" + rng = np.random.RandomState(0) + n_samples, n_features = 10, 5 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False) + glm.fit(X, y) + coef = glm.coef_.copy() + + # sample_weight=np.ones(..) should be equivalent to sample_weight=None + sample_weight = np.ones(y.shape) + glm.fit(X, y, sample_weight=sample_weight) + assert_allclose(glm.coef_, coef, rtol=1e-6) + + # sample_weight are normalized to 1 so, scaling them has no effect + sample_weight = 2*np.ones(y.shape) + glm.fit(X, y, sample_weight=sample_weight) + assert_allclose(glm.coef_, coef, rtol=1e-6) + + # setting one element of sample_weight to 0 is equivalent to removing + # the correspoding sample + sample_weight = np.ones(y.shape) + sample_weight[-1] = 0 + glm.fit(X, y, sample_weight=sample_weight) + coef1 = glm.coef_.copy() + glm.fit(X[:-1], y[:-1]) + assert_allclose(glm.coef_, coef1, rtol=1e-6) @pytest.mark.parametrize( From 499e8d244b5f6bf3ad7856dace4338b1cf5d31e1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 14:41:47 +0200 Subject: [PATCH 187/209] Move GLM losses under sklearn._loss.glm_distribution --- sklearn/_loss/__init__.py | 0 .../_glm/distribution.py => _loss/glm_distribution.py} | 0 sklearn/_loss/tests/__init__.py | 0 .../tests/test_glm_distribution.py} | 2 +- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- sklearn/metrics/regression.py | 2 +- 7 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 sklearn/_loss/__init__.py rename sklearn/{linear_model/_glm/distribution.py => _loss/glm_distribution.py} (100%) create mode 100644 sklearn/_loss/tests/__init__.py rename sklearn/{linear_model/_glm/tests/test_distribution.py => _loss/tests/test_glm_distribution.py} (98%) diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/_loss/glm_distribution.py similarity index 100% rename from sklearn/linear_model/_glm/distribution.py rename to sklearn/_loss/glm_distribution.py diff --git a/sklearn/_loss/tests/__init__.py b/sklearn/_loss/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/_loss/tests/test_glm_distribution.py similarity index 98% rename from sklearn/linear_model/_glm/tests/test_distribution.py rename to sklearn/_loss/tests/test_glm_distribution.py index 97c3a485ef4bb..cb4c5ae07e4d1 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/_loss/tests/test_glm_distribution.py @@ -9,7 +9,7 @@ from scipy.optimize import check_grad import pytest -from sklearn.linear_model._glm.distribution import ( +from sklearn._loss.glm_distribution import ( TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 
65de7f9532717..360db6e4f741e 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -15,7 +15,7 @@ from ...utils import check_array, check_X_y from ...utils.optimize import _check_optimize_result from ...utils.validation import check_is_fitted, _check_sample_weight -from .distribution import ( +from ..._loss.glm_distribution import ( ExponentialDispersionModel, TweedieDistribution, EDM_DISTRIBUTIONS diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2a54b759011e7..9e21ae7775cf4 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -17,7 +17,7 @@ IdentityLink, LogLink, ) -from sklearn.linear_model._glm.distribution import ( +from sklearn._loss.glm_distribution import ( TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 706c484334d21..f7ef99794727b 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -25,6 +25,7 @@ import numpy as np import warnings +from .._loss.glm_distribution import TweedieDistribution from ..utils.validation import (check_array, check_consistent_length, _num_samples) from ..utils.validation import column_or_1d @@ -672,7 +673,6 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): >>> mean_tweedie_deviance(y_true, y_pred, power=1) 1.4260... """ - from ..linear_model._glm.distribution import TweedieDistribution y_type, y_true, y_pred, _ = _check_reg_targets( y_true, y_pred, None, dtype=[np.float64, np.float32]) if y_type == 'continuous-multioutput': From f4aa839da1f0226ff8ed98adb22d6e90446d7120 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 15:21:42 +0200 Subject: [PATCH 188/209] Update sklearn/linear_model/_glm/glm.py Co-Authored-By: Nicolas Hug --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 360db6e4f741e..dd6f847895434 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -571,7 +571,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): mean (:math:`\y_\textrm{pred}`): :math:`v(\y_\textrm{pred}) = \y_\textrm{pred}^{power}`. 
- For ``0 Date: Thu, 3 Oct 2019 14:55:48 +0200 Subject: [PATCH 189/209] Add missing config.add_subpackage in setup.py --- sklearn/linear_model/setup.py | 1 + sklearn/setup.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/sklearn/linear_model/setup.py b/sklearn/linear_model/setup.py index 5cf7040d4c9d4..e50a30eca73da 100644 --- a/sklearn/linear_model/setup.py +++ b/sklearn/linear_model/setup.py @@ -43,6 +43,7 @@ def configuration(parent_package='', top_path=None): # add other directories config.add_subpackage('tests') config.add_subpackage('_glm') + config.add_subpackage('_glm/tests') return config diff --git a/sklearn/setup.py b/sklearn/setup.py index 53f6d3f6eb30c..3913965a375cd 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -47,6 +47,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('experimental/tests') config.add_subpackage('ensemble/_hist_gradient_boosting') config.add_subpackage('ensemble/_hist_gradient_boosting/tests') + config.add_subpackage('_loss/') + config.add_subpackage('_loss/tests') # submodules which have their own setup.py config.add_subpackage('cluster') From d71fb9f8fb54608124f4947931ecf1cbaaba425f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 15:28:50 +0200 Subject: [PATCH 190/209] Address Nicolas comments in the documentation (partial) --- doc/modules/linear_model.rst | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 8945235279d9d..6667057dc5073 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -907,13 +907,13 @@ Generalized Linear Models (GLM) extend linear models in two ways combination of the input variables :math:`X` via an inverse link function :math:`h` as -.. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). +.. math:: \hat{y}(w, X) = h(x^\top w) = h(w_0 + w_1 X_1 + ... + w_p X_p). Secondly, the squared loss function is replaced by the unit deviance :math:`d` of a reproductive exponential dispersion model (EDM) [11]_. The minimization problem becomes -.. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, x_i)) + \frac{\alpha}{2} ||w||_2 +.. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, X_i)) + \frac{\alpha}{2} ||w||_2 with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` @@ -939,8 +939,8 @@ Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` Usage ----- -In the following use cases, a loss different from the squared loss might be -appropriate: +A GLM loss different from the classical squared loss might be appropriate in +the following cases: * If the target values :math:`y` are counts (non-negative integer valued) or frequencies (non-negative), you might use a Poisson deviance with log-link. @@ -985,13 +985,8 @@ of the unit variance function: * If you want to model a relative frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values - together with :math:`s=\mathrm{exposure}` as sample weights. This is done - in both examples linked below. - * The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\frac{\phi}{s_i} v(\mu_i)`. 
- * If the target `y` is a ratio, appropriate sample weights ``s`` should be - provided. + together with :math:`s=\mathrm{exposure}` as sample weights. + As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. @@ -1000,6 +995,10 @@ of the unit variance function: in this case one might say that y has a 'scaled' Poisson distribution. The same holds for other distributions. + * The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\frac{\phi}{s_i} v(\mu_i)`. + The estimator can be used as follows:: >>> from sklearn.linear_model import TweedieRegressor From fa90272e4d1925b4caa967febf417c4ea9e96457 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 16:10:40 +0200 Subject: [PATCH 191/209] More cleanups in the plot_tweedie_regression_insurance_claims.py example --- ...lot_tweedie_regression_insurance_claims.py | 57 ++++++++----------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 1c50541fcd85c..dfd5555c83af4 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -142,7 +142,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # Note: filter out claims with zero amount, as the severity model # requires strictly positive target values. -df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 +df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0 # Correct for unreasonable observations (that might be data error) # and a few exceptionally large claim amounts @@ -150,30 +150,26 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df["Exposure"] = df["Exposure"].clip(upper=1) df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) +log_scale_transformer = make_pipeline( + FunctionTransformer(np.log, validate=False), + StandardScaler() +) + column_trans = ColumnTransformer( [ - ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), - ( - "Veh_Brand_Gas_Region", - OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"], - ), - ("BonusMalus", "passthrough", ["BonusMalus"]), - ( - "Density_log", - make_pipeline( - FunctionTransformer(np.log, validate=False), StandardScaler() - ), - ["Density"], - ), + ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ("onehot_categorical", OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ("passthrough_numeric", "passthrough", ["BonusMalus"]), + ("log_scaled_numeric", log_scale_transformer, ["Density"]), ], remainder="drop", ) X = column_trans.fit_transform(df) -df["Frequency"] = df.ClaimNb / df.Exposure -df["AvgClaimAmount"] = df.ClaimAmount / np.fmax(df.ClaimNb, 1) +df["Frequency"] = df["ClaimNb"] / df["Exposure"] +df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1) print(df[df.ClaimAmount > 0].head()) @@ -268,7 +264,7 @@ def score_estimator( # the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance # bonus/malus (``BonusMalus``). 
-fig, ax = plt.subplots(2, 2, figsize=(16, 8)) +fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(16, 8)) fig.subplots_adjust(hspace=0.3, wspace=0.2) plot_obs_pred( @@ -369,18 +365,12 @@ def score_estimator( # it is conditional on having at least one claim, and cannot be used to predict # the average claim amount per policy in general. -print( - "Mean AvgClaim Amount per policy: %.2f " - % df_train.AvgClaimAmount.mean() -) -print( - "Mean AvgClaim Amount | NbClaim > 0: %.2f" - % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() -) -print( - "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" - % glm_sev.predict(X_train).mean() -) +print("Mean AvgClaim Amount per policy: %.2f " + % df_train["AvgClaimAmount"].mean()) +print("Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean()) +print("Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" + % glm_sev.predict(X_train).mean()) ############################################################################## @@ -388,7 +378,7 @@ def score_estimator( # We can visually compare observed and predicted values, aggregated for # the drivers age (``DrivAge``). -fig, ax = plt.subplots(1, 2, figsize=(16, 4)) +fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 4)) # plot DivAge plot_obs_pred( @@ -500,9 +490,9 @@ def score_estimator( res.append( { "subset": subset_label, - "observed": df.ClaimAmount.values.sum(), + "observed": df["ClaimAmount"].values.sum(), "predicted, frequency*severity model": np.sum( - df.Exposure.values*glm_freq.predict(X)*glm_sev.predict(X) + df["Exposure"].values*glm_freq.predict(X)*glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" % glm_total.best_estimator_.family.power: np.sum( @@ -512,3 +502,4 @@ def score_estimator( ) print(pd.DataFrame(res).set_index("subset").T) +plt.plot() From 4d16f318ca07cf8a00200201042bba177d4d27ba Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Oct 2019 16:13:21 +0200 Subject: [PATCH 192/209] Typos and text improvement in poisson example --- doc/modules/linear_model.rst | 2 +- ...plot_poisson_regression_non_normal_loss.py | 65 ++++++++++--------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 6667057dc5073..4a5aeab305b8a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -991,7 +991,7 @@ of the unit variance function: weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Considering :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, + Considering :math:`\bar{y} = \frac{\sum_i s_i y_i}{\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distribution. The same holds for other distributions. diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d99654cf04080..3ecb02108de4f 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -6,8 +6,8 @@ This example illustrates the use of log-linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compares it with models learned with least squared error. 
The goal is to predict the -number of insurance claims (or frequency) following car accidents for a -policyholder given historical data over a population of policyholders. +expected number of insurance claims (or frequency) following car accidents for +a policyholder given historical data over a population of policyholders. .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor Third-Party Liability Claims (November 8, 2018). @@ -42,8 +42,8 @@ from sklearn.metrics import mean_poisson_deviance -def load_mtpl2(n_samples=None): - """Fetcher for French Motor Third-Party Liability Claims dataset +def load_mtpl2(n_samples=100000): + """Fetch the French Motor Third-Party Liability Claims dataset. Parameters ---------- @@ -122,9 +122,13 @@ def load_mtpl2(n_samples=None): print("Average Frequency = {}" .format(np.average(df["Frequency"], weights=df["Exposure"]))) +print("Percentage of zero claims = {0:%}" + .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / + df["Exposure"].sum())) + ############################################################################## # -# It worth noting that 96 % of policyholders have zero claims, and if we were +# It worth noting that 92 % of policyholders have zero claims, and if we were # to convert this problem into a binary classification task, it would be # significantly imbalanced. # @@ -143,7 +147,7 @@ def load_mtpl2(n_samples=None): def score_estimator(estimator, df_test): - """Score an estimator on the test set""" + """Score an estimator on the test set.""" y_pred = estimator.predict(df_test) @@ -154,13 +158,14 @@ def score_estimator(estimator, df_test): mean_absolute_error(df_test["Frequency"], y_pred, df_test["Exposure"])) - # ignore negative predictions, as they are invalid for + # ignore non-positive predictions, as they are invalid for # the Poisson deviance mask = y_pred > 0 if (~mask).any(): - warnings.warn("estimator yields negative predictions for {} samples " - "out of {}. These will be ignored while computing the " - "Poisson deviance".format((~mask).sum(), mask.shape[0])) + warnings.warn("Estimator yields non-positive predictions for {} " + "samples out of {}. These will be ignored while " + "computing the Poisson deviance" + .format((~mask).sum(), mask.shape[0])) print("mean Poisson deviance: %.3f" % mean_poisson_deviance(df_test["Frequency"][mask], @@ -182,12 +187,12 @@ def score_estimator(estimator, df_test): ############################################################################## # -# The Poisson deviance cannot be computed on negative values predicted by the -# model. For models that do return a few negative predictions +# The Poisson deviance cannot be computed on non-positive values predicted by +# the model. For models that do return a few non-positive predictions # (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, # meaning that the obtained Poisson deviance is approximate. An alternative -# apporach could be to use class:`compose.TransformedTargetRegressor` -# meta-estimator to map ``y_pred`` to strictly positive domain. +# approach could be to use :class:`compose.TransformedTargetRegressor` +# meta-estimator to map ``y_pred`` to a strictly positive domain. print("Ridge evaluation:") score_estimator(ridge, df_test) @@ -210,9 +215,9 @@ def score_estimator(estimator, df_test): # # Finally, we will consider a non-linear model, namely a random forest. 
Random # forests do not require the categorical data to be one-hot encoded, instead -# we encode each category label with an arbirtrary integer using +# we encode each category label with an arbitrary integer using # :class:`preprocessing.OrdinalEncoder` to make the model faster to train (the -# same information is encoded with a small number of features than with +# same information is encoded with a smaller number of features than with # one-hot encoding). rf_preprocessor = ColumnTransformer( @@ -238,12 +243,13 @@ def score_estimator(estimator, df_test): ############################################################################## # -# The random forest model also minimizes the conditional least square error. -# However because of a higher predictive power it also results in a smaller -# Poisson deviance than the Poisson regression model. +# Like the Ridge regression above, the random forest model minimizes the +# conditional squared error, too. However, because of a higher predictive +# power, it also results in a smaller Poisson deviance than the Poisson +# regression model. # # Evaluating models with a single train / test split is prone to random -# fluctuations. If computation resources allow, it should be verified that +# fluctuations. If computing resources allow, it should be verified that # cross-validated performance metrics would lead to similar conclusions. # # The qualitative difference between these models can also be visualized by @@ -274,7 +280,7 @@ def score_estimator(estimator, df_test): # # The experimental data presents a long tail distribution for ``y``. In all # models we predict the mean expected value, so we will have necessarily fewer -# extreme values. Additionally normal distribution used in ``Ridge`` and +# extreme values. Additionally, normal distribution used in ``Ridge`` and # ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. @@ -291,11 +297,10 @@ def score_estimator(estimator, df_test): def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): - """Compare predictions and observations for bins ordered by y_pred + """Compare predictions and observations for bins ordered by y_pred. We order the samples by ``y_pred`` and split it in bins. - In each bin the observed mean is compared with the predicted - mean. + In each bin the observed mean is compared with the predicted mean. Parameters ---------- @@ -306,7 +311,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, sample_weight : array-like of shape (n_samples,) Sample weights. n_bins: int - number of bins to use + Number of bins to use. Returns ------- @@ -370,10 +375,10 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # values. # # However, for some business applications, we are not necessarily interested -# in the the ability of the model in predicting the expected frequency value -# but instead in predicting which policyholder groups are the riskiest and -# which are the safest. In this case the model evaluation would cast the -# problem as a ranking problem rather than a regression problem. +# in the ability of the model to predict the expected frequency value, but +# instead to predict which policyholder groups are the riskiest and which are +# the safest. In this case, the model evaluation would cast the problem as a +# ranking problem rather than a regression problem. 
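A rough way to put a single number on that ranking view, not used in the example itself, is a rank correlation between predictions and observed frequencies; it ignores the exposure weighting, so treat it only as a quick check. The names ``ridge``, ``poisson``, ``rf`` and ``df_test`` refer to the objects defined earlier in this script:

from scipy.stats import spearmanr

# Spearman correlation between predicted and observed frequencies:
# a crude, exposure-unaware proxy for ranking quality.
for label, model in [("Ridge", ridge), ("Poisson", poisson),
                     ("RandomForest", rf)]:
    rho, _ = spearmanr(model.predict(df_test), df_test["Frequency"])
    print("%s: Spearman rho = %.3f" % (label, rho))

The cumulated-claims plot described next addresses the same question while taking exposure into account.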
# # To compare the 3 models under this light on, one can plot the fraction of # the number of claims vs the fraction of exposure for test samples ordered by @@ -435,6 +440,6 @@ def _cumulated_claims(y_true, y_pred, exposure): # # This last point is expected due to the nature of the problem: the occurrence # of accidents is mostly dominated by circumstantial causes that are not -# captured in the columns of the dataset. +# captured in the columns of the dataset or that are indeed random. plt.show() From 15eb1d39c6c2bbbdceeb64996d0f297ef4f80ebf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Oct 2019 16:20:33 +0200 Subject: [PATCH 193/209] EXA sharey for histograms --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 3ecb02108de4f..769c072c624fb 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -257,7 +257,7 @@ def score_estimator(estimator, df_test): # values: -fig, axes = plt.subplots(1, 4, figsize=(16, 3)) +fig, axes = plt.subplots(1, 4, figsize=(16, 3), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) @@ -265,6 +265,7 @@ def score_estimator(estimator, df_test): axes[0].set_title("Data") axes[0].set_yscale('log') axes[0].set_xlabel("y (observed Frequency)") +axes[0].set_ylim([1E2, 5E5]) for idx, model in enumerate([ridge, poisson, rf]): y_pred = model.predict(df_train) From 3d097c686dfb016a4561afca37f28ad0d40dc0f3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 8 Oct 2019 15:07:56 +0200 Subject: [PATCH 194/209] Plot y_pred histograms on the test set --- ...plot_poisson_regression_non_normal_loss.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 769c072c624fb..0e948873da570 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -256,26 +256,32 @@ def score_estimator(estimator, df_test): # comparing the histogram of observed target values with that of predicted # values: - -fig, axes = plt.subplots(1, 4, figsize=(16, 3), sharey=True) +fig, axes = plt.subplots(2, 4, figsize=(16, 6), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 -df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) - -axes[0].set_title("Data") -axes[0].set_yscale('log') -axes[0].set_xlabel("y (observed Frequency)") -axes[0].set_ylim([1E2, 5E5]) - -for idx, model in enumerate([ridge, poisson, rf]): - y_pred = model.predict(df_train) - - pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), ax=axes[idx+1]) - axes[idx + 1].set( - title=model[-1].__class__.__name__, - yscale='log', - xlabel="y_pred (predicted expected Frequency)" - ) +for row_idx, label, df in zip(range(2), + ["train", "test"], + [df_train, df_test]): + df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins), + ax=axes[row_idx, 0]) + + axes[row_idx, 0].set_title("Data") + axes[row_idx, 0].set_yscale('log') + axes[row_idx, 0].set_xlabel("y (observed Frequency)") + axes[row_idx, 0].set_ylim([1e1, 5e5]) + axes[row_idx, 0].set_ylabel(label + " 
samples") + + for idx, model in enumerate([ridge, poisson, rf]): + y_pred = model.predict(df) + + pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), + ax=axes[row_idx, idx+1]) + axes[row_idx, idx + 1].set( + title=model[-1].__class__.__name__, + yscale='log', + xlabel="y_pred (predicted expected Frequency)" + ) +plt.tight_layout() ############################################################################## # From 31f5b3d6a7a5e01055bc8bd914b51d3d92bcafb5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Oct 2019 16:30:37 +0200 Subject: [PATCH 195/209] Compound Poisson => Compound Poisson Gamma --- ...lot_tweedie_regression_insurance_claims.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index dfd5555c83af4..3bdb7d93f0130 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -410,15 +410,15 @@ def score_estimator( # Overall, the drivers age (``DrivAge``) has a weak impact on the claim # severity, both in observed and predicted data. # -# 4. Total claim amount -- Compound Poisson distribution -# ------------------------------------------------------- +# 4. Total claim amount -- Compound Poisson Gamma distribution +# ------------------------------------------------------------ # # As mentioned in the introduction, the total claim amount can be modeled # either as the product of the frequency model by the severity model, # denormalized by exposure. In the following code sample, the -# ``score_estimator`` is extended to score such a model. The mean deviance -# is computed assuming a Tweedie distribution with ``power=1.5`` to be -# comparable with the model from the following section, +# ``score_estimator`` is extended to score such a model. The mean deviance is +# computed assuming a Tweedie distribution with ``power=1.5`` to be comparable +# with the model from the following section, scores = score_estimator( @@ -436,9 +436,9 @@ def score_estimator( ############################################################################## # # Indeed, an alternative approach for modeling the total loss is with a unique -# Compound Poisson model, also corresponding to a Tweedie model -# with a power :math:`p \in (1, 2)`. We determine the optimal hyperparameter -# ``p`` with a grid search, +# Compound Poisson Gamma model, also corresponding to a Tweedie model with a +# power :math:`p \in (1, 2)`. We determine the optimal hyperparameter ``p`` +# with a grid search: from sklearn.model_selection import GridSearchCV @@ -475,12 +475,13 @@ def score_estimator( ############################################################################## # # In this example, the mean absolute error is lower for the Compound Poisson -# model than when using separate models for frequency and severity. +# Gamma model than when using the product of the predictions of separate +# models for frequency and severity. # -# We can additionally validate these models by comparing observed and predicted -# total claim amount over the test and train subsets. We see that in our case -# the frequency-severity model underestimates the total claim amount, whereas -# the Tweedie model overestimates. +# We can additionally validate these models by comparing observed and +# predicted total claim amount over the test and train subsets. 
We see that, +# on average, the frequency-severity model underestimates the total claim +# amount, whereas the Tweedie model overestimates. res = [] for subset_label, X, df in [ From a498ff546da062fb4f6c33bef9d18e45e18249c8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Oct 2019 17:07:02 +0200 Subject: [PATCH 196/209] Compound Poisson => Compound Poisson Gamma --- sklearn/linear_model/_glm/glm.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index dd6f847895434..aae438733d424 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -28,7 +28,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """Regression via a Generalized Linear Model (GLM) with penalties. + """Regression via a penalized Generalized Linear Model (GLM). GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and predicting the mean of the target y as y_pred=h(X*w). @@ -575,19 +575,19 @@ class TweedieRegressor(GeneralizedLinearRegressor): Special cases are: - +-------+------------------+ - | Power | Distribution | - +=======+==================+ - | 0 | Normal | - +-------+------------------+ - | 1 | Poisson | - +-------+------------------+ - | (1,2) | Compound Poisson | - +-------+------------------+ - | 2 | Gamma | - +-------+------------------+ - | 3 | Inverse Gaussian | - +-------+------------------+ + +-------+------------------------+ + | Power | Distribution | + +=======+========================+ + | 0 | Normal | + +-------+------------------------+ + | 1 | Poisson | + +-------+------------------------+ + | (1,2) | Compound Poisson Gamma | + +-------+------------------------+ + | 2 | Gamma | + +-------+------------------------+ + | 3 | Inverse Gaussian | + +-------+------------------------+ alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the From 3fae28a06968ab8ce0d95b5e4a53b2e7a7d66205 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Oct 2019 19:23:19 +0200 Subject: [PATCH 197/209] Various improvement in Tweedie regression example --- ...lot_tweedie_regression_insurance_claims.py | 156 +++++++++++++----- 1 file changed, 118 insertions(+), 38 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 3bdb7d93f0130..4b450fe34bb1e 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -48,7 +48,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.metrics import mean_absolute_error, mean_squared_error +from sklearn.metrics import mean_absolute_error, mean_squared_error, auc def load_mtpl2(n_samples=100000): @@ -157,11 +157,14 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, column_trans = ColumnTransformer( [ - ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), + ["VehAge", "DrivAge"]), ("onehot_categorical", OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), - ("passthrough_numeric", "passthrough", ["BonusMalus"]), - ("log_scaled_numeric", log_scale_transformer, ["Density"]), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + 
("passthrough_numeric", "passthrough", + ["BonusMalus"]), + ("log_scaled_numeric", log_scale_transformer, + ["Density"]), ], remainder="drop", ) @@ -194,7 +197,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, def score_estimator( - estimator, X_train, X_test, df_train, df_test, target, weights + estimator, X_train, X_test, df_train, df_test, target, weights, + power=None, ): """Evaluate an estimator on train and test sets with different metrics""" res = [] @@ -217,13 +221,15 @@ def score_estimator( est_freq, est_sev = estimator y_pred = (df.Exposure.values * est_freq.predict(X) * est_sev.predict(X)) - power = 1.5 else: y_pred = estimator.predict(X) - power = getattr(getattr(estimator, "_family_instance"), - "power") + if power is None: + power = getattr(getattr(estimator, "_family_instance"), + "power") if score_label == "mean deviance": + if power is None: + continue metric = partial(mean_tweedie_deviance, power=power) if metric is None: @@ -378,7 +384,7 @@ def score_estimator( # We can visually compare observed and predicted values, aggregated for # the drivers age (``DrivAge``). -fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 4)) +fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 6)) # plot DivAge plot_obs_pred( @@ -403,7 +409,7 @@ def score_estimator( ax=ax[1], fill_legend=True ) - +plt.tight_layout() ############################################################################## # @@ -411,16 +417,16 @@ def score_estimator( # severity, both in observed and predicted data. # # 4. Total claim amount -- Compound Poisson Gamma distribution -# ------------------------------------------------------------ +# ------------------------------------------------------------ # # As mentioned in the introduction, the total claim amount can be modeled # either as the product of the frequency model by the severity model, # denormalized by exposure. In the following code sample, the # ``score_estimator`` is extended to score such a model. The mean deviance is -# computed assuming a Tweedie distribution with ``power=1.5`` to be comparable -# with the model from the following section, - +# computed assuming a Tweedie distribution with ``power=2`` to be comparable +# with the model from the following section: +eps = 1e-4 scores = score_estimator( (glm_freq, glm_sev), X_train, @@ -429,40 +435,54 @@ def score_estimator( df_test, target="ClaimAmount", weights="Exposure", + power=2-eps, ) print(scores) ############################################################################## # -# Indeed, an alternative approach for modeling the total loss is with a unique -# Compound Poisson Gamma model, also corresponding to a Tweedie model with a -# power :math:`p \in (1, 2)`. We determine the optimal hyperparameter ``p`` -# with a grid search: +# Instead of taking the product of two independently fit models for frequency +# and severity one can directly model the total loss is with a unique Compound +# Poisson Gamma generalized linear model (with a log link function). This +# model is a special case of the Tweedie model with a power parameter :math:`p +# \in (1, 2)`. +# +# We determine the optimal hyperparameter ``p`` with a grid search so as to +# minimize the deviance: from sklearn.model_selection import GridSearchCV -# exclude upper bound as power>=2 does not support y=0. 
-params = {"power": np.linspace(1 + 1e-4, 2 - 1e-4, 8)} - - -# this takes a while -glm_total = GridSearchCV( - TweedieRegressor(tol=1e-3, max_iter=500), cv=3, - param_grid=params, n_jobs=-1 +# exclude upper bound as power>=2 as p=2 would lead to an undefined unit +# deviance on data points with y=0. +params = {"power": np.linspace(1 + eps, 2 - eps, 5)} + +X_train_small, _, df_train_small, _ = train_test_split( + X_train, df_train, train_size=5000) + +# This can takes a while on the full training set, therefore we do the +# hyper-parameter search on a random subset, hoping that the best value of +# power does not depend too much on the dataset size. We use a bit +# penalization to avoid numerical issues with colinear features and speed-up +# convergence. +glm_total = TweedieRegressor(max_iter=10000, alpha=1e-2) +search = GridSearchCV( + glm_total, cv=3, + param_grid=params, n_jobs=-1, verbose=10, + refit=False, ) -glm_total.fit( - X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] +search.fit( + X_train_small, df_train_small["ClaimAmount"], + sample_weight=df_train_small["Exposure"] ) +print("Best hyper-parameters: %s" % search.best_params_) - -print( - "Best hyperparameters: power=%.2f\n" - % glm_total.best_estimator_.family.power -) +glm_total.set_params(**search.best_params_) +glm_total.fit(X_train, df_train["ClaimAmount"], + sample_weight=df_train["Exposure"]) scores = score_estimator( - glm_total.best_estimator_, + glm_total, X_train, X_test, df_train, @@ -496,11 +516,71 @@ def score_estimator( df["Exposure"].values*glm_freq.predict(X)*glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" - % glm_total.best_estimator_.family.power: np.sum( - glm_total.best_estimator_.predict(X) - ), + % glm_total.power: np.sum(glm_total.predict(X)), } ) print(pd.DataFrame(res).set_index("subset").T) + +############################################################################## +# +# Finally, we can compare the two models using a plot of cumulated claims: for +# each model, the policyholders are ranked from riskiest to safest and the +# actual cumulated claims are plotted against the cumulated exposure. +# +# The area under the curve can be used as a model selection metric to quantify +# the ability of the model to rank policyholders. Note that this metric does +# not reflect the ability of the models to make accurate predictions in terms +# of absolute value of total claim amounts but only in terms of relative +# amounts as a ranking metric. +# +# Both models are able to rank policyholders by risky-ness significantly +# better than chance although they are also both far from perfect due to the +# natural difficulty of the prediction problem from few features. 
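A note on how this area relates to the Gini-style metrics introduced later in this series: with policyholders ordered riskiest-first, as in the ``_cumulated_claims`` helper that follows, a completely uninformative ranking traces the diagonal (area 0.5), so the area can be summarized as

.. math:: G_{\text{riskiest-first}} = 2\,\mathrm{AUC} - 1, \qquad
          G_{\text{safest-first}} = 1 - 2\,\mathrm{AUC},

where the second form is the one used by the ``lorenz_curve`` and ``gini_score`` utilities added in a later patch, which order policyholders from safest to riskiest.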
+ + +def _cumulated_claims(y_true, y_pred, exposure): + idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest + sorted_exposure = exposure[idx_sort] + sorted_frequencies = y_true[idx_sort] + cumulated_exposure = np.cumsum(sorted_exposure) + cumulated_exposure /= cumulated_exposure[-1] + cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) + cumulated_claims /= cumulated_claims[-1] + return cumulated_exposure, cumulated_claims + + +fig, ax = plt.subplots(figsize=(8, 8)) + +y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test) +y_pred_total = glm_total.predict(X_test) + +for label, y_pred in [("Frequency * Severity model", y_pred_product), + ("Compound Poisson Gamma", y_pred_total)]: + cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + y_pred, + df_test["Exposure"].values) + area = auc(cum_exposure, cum_claims) + label += " (area under curve: {:.3f})".format(area) + ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) + +# Oracle model: y_pred == y_test +cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + df_test["Frequency"].values, + df_test["Exposure"].values) +area = auc(cum_exposure, cum_claims) +label = "Oracle (area under curve: {:.3f})".format(area) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) + +# Random Baseline +ax.plot([0, 1], [0, 1], linestyle="--", color="black", + label="Random baseline") +ax.set( + title="Cumulated claim amount by model", + xlabel='Fraction of exposure (from riskiest to safest)', + ylabel='Fraction of total claim amount' +) +ax.legend(loc="lower right") plt.plot() From a47798afe6e30e75b66f274a2323838c6a1401ea Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:07:09 +0200 Subject: [PATCH 198/209] Update doc/modules/linear_model.rst Co-Authored-By: Thomas J Fan --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d9e60a3517f8e..3119b9b0db94b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -915,7 +915,7 @@ problem becomes .. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, X_i)) + \frac{\alpha}{2} ||w||_2 -with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. +with sample weights :math:`s_i`, and L2 regularization penalty :math:`\alpha`. The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as From 83391dd56bac107b21eea4cb258f3831a56d02ff Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:49:53 +0200 Subject: [PATCH 199/209] Use latest docstring conventions everywhere --- sklearn/_loss/glm_distribution.py | 36 +++++------ sklearn/linear_model/_glm/glm.py | 100 +++++++++++++++--------------- sklearn/linear_model/_glm/link.py | 8 +-- sklearn/metrics/regression.py | 6 +- 4 files changed, 74 insertions(+), 76 deletions(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index 5f9e9ed06847c..4020f74427c44 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -60,7 +60,7 @@ def in_y_range(self, y): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. 
""" # Note that currently supported distributions have +inf upper bound @@ -92,7 +92,7 @@ def unit_variance(self, y_pred): Parameters ---------- - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. """ pass # pragma: no cover @@ -105,7 +105,7 @@ def unit_variance_derivative(self, y_pred): Parameters ---------- - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Target values. """ pass # pragma: no cover @@ -121,10 +121,10 @@ def unit_deviance(self, y, y_pred, check_input=False): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. check_input : bool, default=False @@ -132,7 +132,7 @@ def unit_deviance(self, y, y_pred, check_input=False): they will be propagated as NaN. Returns ------- - deviance: array, shape (n_samples,) + deviance: array of shape (n_samples,) Computed deviance """ pass # pragma: no cover @@ -147,10 +147,10 @@ def unit_deviance_derivative(self, y, y_pred): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. """ return -2 * (y - y_pred) / self.unit_variance(y_pred) @@ -168,13 +168,13 @@ def deviance(self, y, y_pred, weights=1): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. - weights : array, shape (n_samples,) (default=1) + weights : {int, array of shape (n_samples,)}, default=1 Weights or exposure to which variance is inverse proportional. """ return np.sum(weights * self.unit_deviance(y, y_pred)) @@ -193,7 +193,7 @@ def deviance_derivative(self, y, y_pred, weights=1): y_pred : array, shape (n_samples,) Predicted mean. - weights : array, shape (n_samples,) (default=1) + weights : {int, array of shape (n_samples,)}, default=1 Weights or exposure to which variance is inverse proportional. """ return weights * self.unit_deviance_derivative(y, y_pred) @@ -231,7 +231,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- - power : float (default=0) + power : float, default=0 The variance power of the `unit_variance` :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`. For ``0 0. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns @@ -743,7 +743,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): y_pred : array-like of shape (n_samples,) Estimated target values. Requires y_pred > 0. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. 
Returns From 3bfb54e9e172b6f2cae3e48a7ece4d011937e786 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:53:42 +0200 Subject: [PATCH 200/209] Drop check_input parameter --- sklearn/linear_model/_glm/glm.py | 22 ++++++--------------- sklearn/linear_model/_glm/tests/test_glm.py | 12 +---------- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 2012f3cbb32a9..990de8114a717 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -92,11 +92,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. - check_input : bool, default=True - Allow to bypass several checks on input: y values in range of family, - sample_weight non-negative. - Don't use this parameter unless you know what you do. - verbose : int, default=0 For the lbfgs solver set verbose to any positive number for verbosity. @@ -115,7 +110,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + copy_X=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept self.family = family @@ -125,7 +120,6 @@ def __init__(self, alpha=1.0, self.tol = tol self.warm_start = warm_start self.copy_X = copy_X - self.check_input = check_input self.verbose = verbose def fit(self, X, y, sample_weight=None): @@ -213,9 +207,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) - if not isinstance(self.check_input, bool): - raise ValueError("The argument check_input must be bool; got " - "(check_input={0})".format(self.check_input)) family = self._family_instance link = self._link_instance @@ -228,12 +219,11 @@ def fit(self, X, y, sample_weight=None): _, n_features = X.shape - if self.check_input: - if not np.all(family.in_y_range(y)): - raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}" - .format(family.__class__.__name__)) - # TODO: if alpha=0 check that X is not rank deficient + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # TODO: if alpha=0 check that X is not rank deficient # rescaling of sample_weight # diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 9e21ae7775cf4..c0ff6508db9c9 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -164,16 +164,6 @@ def test_glm_copy_X_argument(copy_X): glm.fit(X, y) -@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) -def test_glm_check_input_argument(check_input): - """Test GLM for invalid check_input argument.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(check_input=check_input) - with pytest.raises(ValueError, match="check_input must be bool"): - glm.fit(X, y) - - def test_glm_identity_regression(): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] 
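To illustrate the target validation retained in ``fit`` above (the ``in_y_range`` check that now runs unconditionally), here is a small sketch of what a user sees when the target falls outside the family's support; the data is made up and the quoted message is abbreviated:

import numpy as np
from sklearn.linear_model import PoissonRegressor

X = np.array([[1.0], [2.0], [3.0]])
y = np.array([0.0, 1.0, -1.0])   # a negative count is outside the Poisson range

try:
    PoissonRegressor().fit(X, y)
except ValueError as exc:
    # e.g. "Some value(s) of y are out of the valid range for family
    # PoissonDistribution"
    print(exc)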
@@ -291,7 +281,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept): glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, - check_input=False, max_iter=300) + max_iter=300) glm.fit(X_train, y_train) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) From d325fe23348f8dabfcc55dd4fbd8fa82fd60ff8d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:57:10 +0200 Subject: [PATCH 201/209] Use keyword only arguments SLEP009 --- sklearn/linear_model/_glm/glm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 990de8114a717..8ef912f9596b6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -107,7 +107,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, + def __init__(self, *, alpha=1.0, fit_intercept=True, family='normal', link='auto', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, verbose=0): @@ -435,8 +435,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, verbose=0): + def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, + tol=1e-4, warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', max_iter=max_iter, @@ -514,8 +514,8 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, verbose=0): + def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, + tol=1e-4, warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="gamma", link='log', max_iter=max_iter, @@ -626,7 +626,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. 
""" - def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, + def __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True, link='auto', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, verbose=0): From 661cf56e04bc23e53a7173bc43d998187892d9ec Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 12:04:37 +0200 Subject: [PATCH 202/209] Move _y_pred_deviance_derivative from losses as a private function --- sklearn/_loss/glm_distribution.py | 19 ------------------- sklearn/linear_model/_glm/glm.py | 25 +++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index 4020f74427c44..55365d382c03b 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -13,14 +13,6 @@ from scipy.special import xlogy -def _safe_lin_pred(X, coef): - """Compute the linear predictor taking care if intercept is present.""" - if coef.size == X.shape[1] + 1: - return X @ coef[1:] + coef[0] - else: - return X @ coef - - DistributionBoundary = namedtuple("DistributionBoundary", ("value", "inclusive")) @@ -198,17 +190,6 @@ def deviance_derivative(self, y, y_pred, weights=1): """ return weights * self.unit_deviance_derivative(y, y_pred) - def _y_pred_deviance_derivative(self, coef, X, y, weights, link): - """Compute y_pred and the derivative of the deviance w.r.t coef.""" - lin_pred = _safe_lin_pred(X, coef) - y_pred = link.inverse(lin_pred) - d1 = link.inverse_derivative(lin_pred) - temp = d1 * self.deviance_derivative(y, y_pred, weights) - if coef.size == X.shape[1] + 1: - devp = np.concatenate(([temp.sum()], temp @ X)) - else: - devp = temp @ X # same as X.T @ temp - return y_pred, devp class TweedieDistribution(ExponentialDispersionModel): diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 8ef912f9596b6..b29dcd89a35a6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -27,6 +27,27 @@ ) +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _y_pred_deviance_derivative(coef, X, y, weights, family, link): + """Compute y_pred and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + y_pred = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * family.deviance_derivative(y, y_pred, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # same as X.T @ temp + return y_pred, devp + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a penalized Generalized Linear Model (GLM). 
@@ -251,8 +272,8 @@ def fit(self, X, y, sample_weight=None): if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): - y_pred, devp = family._y_pred_deviance_derivative( - coef, X, y, weights, link + y_pred, devp = _y_pred_deviance_derivative( + coef, X, y, weights, family, link ) dev = family.deviance(y, y_pred, weights) intercept = (coef.size == X.shape[1] + 1) From 560c180fee5c1217353af106beeca04a0b9bd732 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 15:28:21 +0200 Subject: [PATCH 203/209] Fix cumulated claim amount curve in Tweedie regression example --- ...lot_tweedie_regression_insurance_claims.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 4b450fe34bb1e..7e4a8599dec0e 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -539,15 +539,15 @@ def score_estimator( # natural difficulty of the prediction problem from few features. -def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest - sorted_exposure = exposure[idx_sort] - sorted_frequencies = y_true[idx_sort] - cumulated_exposure = np.cumsum(sorted_exposure) +def _cumulated_claim_amount(y_true, y_pred, exposure): + ranking = np.argsort(y_pred)[::-1] # from riskiest to safest + ranked_exposure = exposure[ranking] + ranked_claim_amount = y_true[ranking] + cumulated_exposure = np.cumsum(ranked_exposure) cumulated_exposure /= cumulated_exposure[-1] - cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) - cumulated_claims /= cumulated_claims[-1] - return cumulated_exposure, cumulated_claims + cumulated_claim_amount = np.cumsum(ranked_claim_amount) + cumulated_claim_amount /= cumulated_claim_amount[-1] + return cumulated_exposure, cumulated_claim_amount fig, ax = plt.subplots(figsize=(8, 8)) @@ -557,8 +557,8 @@ def _cumulated_claims(y_true, y_pred, exposure): for label, y_pred in [("Frequency * Severity model", y_pred_product), ("Compound Poisson Gamma", y_pred_total)]: - cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, + cum_exposure, cum_claims = _cumulated_claim_amount( + df_test["ClaimAmount"].values, y_pred, df_test["Exposure"].values) area = auc(cum_exposure, cum_claims) @@ -566,9 +566,9 @@ def _cumulated_claims(y_true, y_pred, exposure): ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - df_test["Frequency"].values, +cum_exposure, cum_claims = _cumulated_claim_amount( + df_test["ClaimAmount"].values, + df_test["ClaimAmount"].values, df_test["Exposure"].values) area = auc(cum_exposure, cum_claims) label = "Oracle (area under curve: {:.3f})".format(area) From 0ea2dce29a52829fb2c0ace0c992f68e1f052f0a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 16:15:25 +0200 Subject: [PATCH 204/209] PEP8 --- sklearn/_loss/glm_distribution.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index 55365d382c03b..dbfac6af673ae 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -191,7 +191,6 @@ def deviance_derivative(self, y, y_pred, weights=1): return weights * 
self.unit_deviance_derivative(y, y_pred) - class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. From a608c703b5960994c569e0191775c3dfa0c805e0 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 19:07:31 +0200 Subject: [PATCH 205/209] WIP implementation of Gini coeff and Lorenz curve --- sklearn/metrics/__init__.py | 4 +++ sklearn/metrics/ranking.py | 53 +++++++++++++++++++++++++++++++++++++ sklearn/metrics/scorer.py | 7 +++-- 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index b0846f2ff6828..9f284e9df54fb 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -14,6 +14,8 @@ from .ranking import precision_recall_curve from .ranking import roc_auc_score from .ranking import roc_curve +from .ranking import gini_score +from .ranking import lorenz_curve from .classification import accuracy_score from .classification import balanced_accuracy_score @@ -106,6 +108,7 @@ 'fbeta_score', 'fowlkes_mallows_score', 'get_scorer', + 'gini_score', 'hamming_loss', 'hinge_loss', 'homogeneity_completeness_v_measure', @@ -114,6 +117,7 @@ 'jaccard_similarity_score', 'label_ranking_average_precision_score', 'label_ranking_loss', + 'lorenz_curve', 'log_loss', 'make_scorer', 'nan_euclidean_distances', diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index d1a14910897f1..01f7ee510d845 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -1390,3 +1390,56 @@ def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False): _check_dcg_target_type(y_true) gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties) return np.average(gain, weights=sample_weight) + + +def lorenz_curve(y_true, y_pred, sample_weight=None, + ascending_predictions=True, + normalize=True, + return_gini=False): + y_true = check_array(y_true, ensure_2d=False) + y_pred = check_array(y_pred, ensure_2d=False) + check_consistent_length(y_true, y_pred) + y_true_min = y_true.min() + if y_true_min < 0: + raise ValueError("lorenz_curve is only defined for regression problems" + " with non-negative target values. Observed minimum" + " target value is %f" % y_true_min) + if sample_weight is None: + sample_weight = np.ones(len(y_true), dtype=np.float64) + else: + sample_weight = check_array(sample_weight, ensure_2d=False) + check_consistent_length(y_true, sample_weight) + + # Rank the ranking base on y_pred + ranking = np.argsort(y_pred) + if not ascending_predictions: + ranking = ranking[::-1] + + ranked_sample_weight = sample_weight[ranking] + ranked_target = y_true[ranking] + + # Accumulate the sample weights and target values + cumulated_samples = np.cumsum(ranked_sample_weight) + cumulated_target = np.cumsum(ranked_target) + + # Normalize to report fractions instead of absolute values. 
+ # Normalization is necessary to compute the Gini index from + # the area under the Lorenz curve + if normalize: + cumulated_samples /= cumulated_samples[-1] + cumulated_target /= cumulated_target[-1] + + if return_gini: + if not normalize or not ascending_predictions: + raise ValueError("Gini coefficient requires normalize=True" + " and ascending_predictions=True") + gini = 1 - 2 * auc(cumulated_samples, cumulated_target) + return cumulated_samples, cumulated_target, gini + return cumulated_samples, cumulated_target + + +def gini_score(y_true, y_pred, sample_weight=None): + cumulated_weights, cumulated_values = lorenz_curve( + y_true, y_pred, sample_weight=sample_weight, + ascending_predictions=True, normalize=True) + return 1 - 2 * auc(cumulated_weights, cumulated_values) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 25b826ff91f75..06942f71333d6 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -31,7 +31,7 @@ f1_score, roc_auc_score, average_precision_score, precision_score, recall_score, log_loss, balanced_accuracy_score, explained_variance_score, - brier_score_loss, jaccard_score) + brier_score_loss, jaccard_score, gini_score) from .cluster import adjusted_rand_score from .cluster import homogeneity_score @@ -634,6 +634,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, mean_gamma_deviance, greater_is_better=False ) +gini_scorer = make_scorer(gini_score) + # Standard Classification Scores accuracy_scorer = make_scorer(accuracy_score) balanced_accuracy_scorer = make_scorer(balanced_accuracy_score) @@ -707,7 +709,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, mutual_info_score=mutual_info_scorer, adjusted_mutual_info_score=adjusted_mutual_info_scorer, normalized_mutual_info_score=normalized_mutual_info_scorer, - fowlkes_mallows_score=fowlkes_mallows_scorer) + fowlkes_mallows_score=fowlkes_mallows_scorer, + gini_score=gini_scorer) for name, metric in [('precision', precision_score), From 853f8b785bf24eff8e39b653eee49a73042a8e86 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 19:08:01 +0200 Subject: [PATCH 206/209] Use Lorenz curve in Tweedie example --- ...lot_tweedie_regression_insurance_claims.py | 79 +++++++++---------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 7e4a8599dec0e..fb44484c2d0bf 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -48,15 +48,16 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.metrics import mean_absolute_error, mean_squared_error, auc +from sklearn.metrics import mean_absolute_error, mean_squared_error +from sklearn.metrics import lorenz_curve -def load_mtpl2(n_samples=100000): +def load_mtpl2(n_samples=None): """Fetch the French Motor Third-Party Liability Claims dataset. Parameters ---------- - n_samples: int, default=100000 + n_samples: int, default=None number of samples to select (for faster run time). Full dataset has 678013 samples. """ @@ -138,7 +139,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # containing the claim amount (``ClaimAmount``) for the same policy ids # (``IDpol``). 
-df = load_mtpl2(n_samples=60000) +df = load_mtpl2() # Note: filter out claims with zero amount, as the severity model # requires strictly positive target values. @@ -188,7 +189,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, # and use ``Exposure`` as `sample_weight`. -df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=40) # Some of the features are colinear, we use a weak penalization to avoid # numerical issues. @@ -449,7 +450,7 @@ def score_estimator( # \in (1, 2)`. # # We determine the optimal hyperparameter ``p`` with a grid search so as to -# minimize the deviance: +# maximize the Gini coefficient (a risk ranking metric): from sklearn.model_selection import GridSearchCV @@ -458,7 +459,7 @@ def score_estimator( params = {"power": np.linspace(1 + eps, 2 - eps, 5)} X_train_small, _, df_train_small, _ = train_test_split( - X_train, df_train, train_size=5000) + X_train, df_train, train_size=5000, random_state=0) # This can takes a while on the full training set, therefore we do the # hyper-parameter search on a random subset, hoping that the best value of @@ -467,15 +468,17 @@ def score_estimator( # convergence. glm_total = TweedieRegressor(max_iter=10000, alpha=1e-2) search = GridSearchCV( - glm_total, cv=3, - param_grid=params, n_jobs=-1, verbose=10, - refit=False, + glm_total, param_grid=params, cv=3, scoring="gini_score", + n_jobs=-1, verbose=1, refit=False ) search.fit( X_train_small, df_train_small["ClaimAmount"], sample_weight=df_train_small["Exposure"] ) print("Best hyper-parameters: %s" % search.best_params_) +cv_results = pd.DataFrame(search.cv_results_).sort_values( + "mean_test_score", ascending=False) +print(cv_results[["param_power", "mean_test_score", "std_test_score"]]) glm_total.set_params(**search.best_params_) glm_total.fit(X_train, df_train["ClaimAmount"], @@ -524,32 +527,26 @@ def score_estimator( ############################################################################## # -# Finally, we can compare the two models using a plot of cumulated claims: for -# each model, the policyholders are ranked from riskiest to safest and the -# actual cumulated claims are plotted against the cumulated exposure. +# Finally, we can compare the two models using a plot of Lorenz curve of +# cumulated claims: for each model, the policyholders are ranked from safest +# to riskiest and the actual cumulated claims are plotted against the +# cumulated exposure. # -# The area under the curve can be used as a model selection metric to quantify -# the ability of the model to rank policyholders. Note that this metric does -# not reflect the ability of the models to make accurate predictions in terms -# of absolute value of total claim amounts but only in terms of relative -# amounts as a ranking metric. +# The Gini coefficient can be computed from the areas under curve to compare +# the model to the random baseline. This coefficient can be used as a model +# selection metric to quantify the ability of the model to rank policyholders. +# A Gini coefficient close to 0 means random ranking, while larger Gini +# coefficient of 1 mean more discriminative rankings. +# +# Note that this metric does not reflect the ability of the models to make +# accurate predictions in terms of absolute value of total claim amounts but +# only in terms of relative amounts as a ranking metric. 
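For readers who want the ranking summary as a number without the plot, here is a numpy-only sketch of the quantity that the new ``lorenz_curve`` and ``gini_score`` helpers compute, with ``sklearn.metrics.auc`` replaced by the trapezoidal rule; it is illustrative, not the actual implementation:

import numpy as np

def gini_coefficient(y_true, y_pred, exposure):
    # Order policies from safest to riskiest according to the model,
    # then accumulate exposure and observed claim amounts.
    order = np.argsort(y_pred)
    cum_exposure = np.cumsum(exposure[order])
    cum_claims = np.cumsum(y_true[order])
    cum_exposure = cum_exposure / cum_exposure[-1]
    cum_claims = cum_claims / cum_claims[-1]
    # Area under the Lorenz curve; 0 means a random ranking,
    # larger values mean a more discriminative ranking.
    return 1 - 2 * np.trapz(cum_claims, cum_exposure)

# e.g. gini_coefficient(df_test["ClaimAmount"].values, y_pred_total,
#                       df_test["Exposure"].values)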
# # Both models are able to rank policyholders by risky-ness significantly # better than chance although they are also both far from perfect due to the # natural difficulty of the prediction problem from few features. -def _cumulated_claim_amount(y_true, y_pred, exposure): - ranking = np.argsort(y_pred)[::-1] # from riskiest to safest - ranked_exposure = exposure[ranking] - ranked_claim_amount = y_true[ranking] - cumulated_exposure = np.cumsum(ranked_exposure) - cumulated_exposure /= cumulated_exposure[-1] - cumulated_claim_amount = np.cumsum(ranked_claim_amount) - cumulated_claim_amount /= cumulated_claim_amount[-1] - return cumulated_exposure, cumulated_claim_amount - - fig, ax = plt.subplots(figsize=(8, 8)) y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test) @@ -557,21 +554,19 @@ def _cumulated_claim_amount(y_true, y_pred, exposure): for label, y_pred in [("Frequency * Severity model", y_pred_product), ("Compound Poisson Gamma", y_pred_total)]: - cum_exposure, cum_claims = _cumulated_claim_amount( - df_test["ClaimAmount"].values, - y_pred, - df_test["Exposure"].values) - area = auc(cum_exposure, cum_claims) - label += " (area under curve: {:.3f})".format(area) + cum_exposure, cum_claims, gini = lorenz_curve( + df_test["ClaimAmount"], y_pred, + sample_weight=df_test["Exposure"], + return_gini=True) + label += " (Gini coefficient: {:.3f})".format(gini) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = _cumulated_claim_amount( - df_test["ClaimAmount"].values, - df_test["ClaimAmount"].values, - df_test["Exposure"].values) -area = auc(cum_exposure, cum_claims) -label = "Oracle (area under curve: {:.3f})".format(area) +cum_exposure, cum_claims, gini = lorenz_curve( + df_test["ClaimAmount"], df_test["ClaimAmount"], + sample_weight=df_test["Exposure"], + return_gini=True) +label = "Oracle (Gini coefficient: {:.3f})".format(gini) ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) # Random Baseline @@ -582,5 +577,5 @@ def _cumulated_claim_amount(y_true, y_pred, exposure): xlabel='Fraction of exposure (from riskiest to safest)', ylabel='Fraction of total claim amount' ) -ax.legend(loc="lower right") +ax.legend(loc="upper left") plt.plot() From b3b55e8cf36124c15f264b60a97c940db6e138a7 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 19:15:31 +0200 Subject: [PATCH 207/209] PEP8 --- sklearn/metrics/ranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 01f7ee510d845..11879dfd13033 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -993,7 +993,7 @@ def label_ranking_loss(y_true, y_score, sample_weight=None): unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores)) all_at_reversed_rank = np.bincount(unique_inverse, - minlength=len(unique_scores)) + minlength=len(unique_scores)) false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank # if the scores are ordered, it's possible to count the number of From 640f0179a27af62c4510287fc80a7a63e2db7dc9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 11 Oct 2019 09:07:52 +0200 Subject: [PATCH 208/209] Make sure labels/weights are floats before normalizing --- sklearn/metrics/ranking.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 11879dfd13033..2fb8e5c429df5 100644 --- 
a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -1396,8 +1396,10 @@ def lorenz_curve(y_true, y_pred, sample_weight=None, ascending_predictions=True, normalize=True, return_gini=False): - y_true = check_array(y_true, ensure_2d=False) - y_pred = check_array(y_pred, ensure_2d=False) + y_true = check_array(y_true, ensure_2d=False, + dtype=[np.float64, np.float32]) + y_pred = check_array(y_pred, ensure_2d=False, + dtype=[np.float64, np.float32]) check_consistent_length(y_true, y_pred) y_true_min = y_true.min() if y_true_min < 0: @@ -1407,7 +1409,8 @@ def lorenz_curve(y_true, y_pred, sample_weight=None, if sample_weight is None: sample_weight = np.ones(len(y_true), dtype=np.float64) else: - sample_weight = check_array(sample_weight, ensure_2d=False) + sample_weight = check_array(sample_weight, ensure_2d=False, + dtype=[np.float64, np.float32]) check_consistent_length(y_true, sample_weight) # Rank the ranking base on y_pred From 6dd197ab0774b66921d0e93112664d4755478303 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 11 Oct 2019 09:08:20 +0200 Subject: [PATCH 209/209] Update scorer test framework --- sklearn/metrics/tests/test_score_objects.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index cfabed6d2c4ac..8aaa3e0658fdf 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -47,7 +47,8 @@ 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'max_error', 'neg_mean_poisson_deviance', - 'neg_mean_gamma_deviance'] + 'neg_mean_gamma_deviance', + 'gini_score'] CLF_SCORERS = ['accuracy', 'balanced_accuracy', 'f1', 'f1_weighted', 'f1_macro', 'f1_micro', @@ -73,7 +74,8 @@ 'jaccard_samples'] REQUIRE_POSITIVE_Y_SCORERS = ['neg_mean_poisson_deviance', - 'neg_mean_gamma_deviance'] + 'neg_mean_gamma_deviance', + 'gini_score'] def _require_positive_y(y):
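Since the last patch above registers ``gini_score`` in the ``SCORERS`` dictionary and in the scorer test lists, the work-in-progress metric is also reachable through the usual scoring-string mechanism. A brief usage sketch on synthetic data, valid only against the code in this series (the scorer is explicitly marked WIP and its API may still change):

import numpy as np
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 3))
y = rng.poisson(lam=np.exp(X @ np.array([0.2, -0.1, 0.3])))

# "gini_score" resolves to the gini_scorer registered in sklearn.metrics.scorer
scores = cross_val_score(PoissonRegressor(), X, y, cv=3, scoring="gini_score")
print(scores.mean())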