From d5e88108a71c6c256b67948515f20955cc96fabf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 Jul 2017 21:50:10 +0200 Subject: [PATCH 01/69] [WIP] Add Generalized Linear Model, issue #5975, initial commit --- sklearn/linear_model/__init__.py | 7 + sklearn/linear_model/glm.py | 872 +++++++++++++++++++++++++ sklearn/linear_model/tests/test_glm.py | 73 +++ 3 files changed, 952 insertions(+) create mode 100644 sklearn/linear_model/glm.py create mode 100644 sklearn/linear_model/tests/test_glm.py diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 2e01990ccce8c..5acc51e9dc87f 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,6 +18,12 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from .glm import (Link, IdentityLink, LogLink, + ExponentialDispersionModel, TweedieDistribution, + NormalDistribution, GaussianDistribution, + PoissonDistribution, GammaDistribution, + InverseGaussianDistribution, GeneralizedHyperbolicSecand, + GeneralizedLinearModel) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -38,6 +44,7 @@ 'BayesianRidge', 'ElasticNet', 'ElasticNetCV', + 'GeneralizedLinearModel', 'Hinge', 'Huber', 'HuberRegressor', diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py new file mode 100644 index 0000000000000..8b6eb8f3bf16c --- /dev/null +++ b/sklearn/linear_model/glm.py @@ -0,0 +1,872 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +# TODO: Which name? GeneralizedLinearModel vs GeneralizedLinearRegression. +# So far, it is GeneralizedLinearModel, since it could very easily +# extended by Bernoulli/Binomial distribution. +# TODO: Which name/symbol for coefficients and weights in docu? +# sklearn.linear_models uses w for coefficients. +# So far, coefficients=beta and weight=w (as standard literature) +# TODO: Add l2-penalty +# TODO: Add l1-penalty (elastic net) +# TODO: Add cross validation +# TODO: Write docu and examples + +# Design Decisions: +# - The link funtion (instance of class Link) is necessary for the evaluation +# of deviance, score, Fisher and Hessian matrix as functions of the +# coefficients, which is needed by optimizers. +# Solution: link as argument in those functions + +from __future__ import division +from abc import ABCMeta, abstractmethod, abstractproperty +import numbers +import numpy as np +from scipy import linalg, optimize, sparse +import warnings +from .base import LinearModel, LinearRegression +from ..base import RegressorMixin +from ..utils import check_X_y +from ..utils.extmath import safe_sparse_dot +from ..utils.optimize import newton_cg +from ..utils.validation import check_is_fitted + + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link funtions + """ + + @abstractmethod + def link(self, mu): + """The link function g(mu) with argument mu=E[Y] returns the + linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def derivative(self, mu): + """Derivative of the link g'(mu). + """ + raise NotImplementedError + + @abstractmethod + def inverse(self, lin_pred): + """The inverse link function h(lin_pred) with the linear predictor as + argument returns mu=E[Y]. 
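As an illustration of the Link contract defined above (not part of the patch), the log link and its inverse can be checked numerically with plain NumPy; all names below exist only for this example::

    # Numerical sketch: g(mu) -> eta and h(eta) -> mu are inverses,
    # and h'(eta) agrees with a finite-difference estimate.
    import numpy as np

    mu = np.array([0.5, 1.0, 2.0])
    eta = np.log(mu)                       # g(mu), the linear predictor
    assert np.allclose(np.exp(eta), mu)    # h(eta) = g^{-1}(eta)

    eps = 1e-6
    h_prime_fd = (np.exp(eta + eps) - np.exp(eta - eps)) / (2 * eps)
    assert np.allclose(h_prime_fd, np.exp(eta))    # h'(eta) = exp(eta)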
+ """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Derivative of the inverse link function h'(lin_pred). + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Second derivative of the inverse link function h''(lin_pred). + """ + raise NotImplementedError + +class IdentityLink(Link): + """The identity link function g(x)=x. + """ + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x). + """ + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + """Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _score + _fisher_matrix + _observed_information + _deviance + _deviance_derivative + _deviance_hessian + + References + ---------- + See https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + + @abstractproperty + def lower_bound(self): + """The lower bound of values of Y~EDM. + """ + raise NotImplementedError() + + @abstractproperty + def upper_bound(self): + """The upper bound of values of Y~EDM. + """ + raise NotImplementedError() + + @abstractmethod + def in_y_range(self, x): + """Returns true if x is in the valid range of Y~EDM. + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance(self, mu): + """The unit variance :math:`v(mu)` determines the variance as + a function of the mean mu by + :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance_derivative(self, mu): + """The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + """ + raise NotImplementedError() + + def variance(self, mu, phi=1, weight=1): + """The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is + :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, + with unit variance v(mu). + """ + return phi/weight * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weight=1): + """The derivative of the variance w.r.t. mu, + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/w_i*v'(\mu_i)`, with unit variance v(mu). 
+ """ + return phi/weight * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + """The unit_deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is given by + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + """ + raise NotImplementedError() + + def unit_deviance_derivative(self, y, mu): + """The derivative w.r.t. mu of the unit_deviance + :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Returns + ------- + derivative: array, shape = (n_samples,) + """ + return -2*(y-mu)/self.unit_variance(mu) + + def deviance(self, y, mu, weight=1): + """The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) + with weight :math:`w_i` and unit_deviance :math:`d(y,mu)`. + In terms of the likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{w}) + - loglike(y,y,\frac{phi}{w})\right).` + """ + return np.sum(weight*self.unit_deviance(y,mu)) + + def _deviance(self, coef, X, y, weight, link): + """The deviance as a function of the coefficients ``coef`` + (:math:`beta`). + """ + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + return self.deviance(y, mu, weight) + + def deviance_derivative(self, y, mu, weight=1): + """The derivative w.r.t. mu of the deviance.` + """ + return weight*self.unit_deviance_derivative(y,mu) + + def _score(self, coef, phi, X, y, weight, link): + """The score function :math:`s` is the derivative of the + log-likelihood w.r.t. the ``coef`` (:math:`\beta`). + It is given by + + .. math: + + \mathbf{s}(\boldsymbol{\beta}) = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}(y_1),\ldots)`. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + d = link.inverse_derivative(lin_pred) + d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) + score = safe_sparse_dot(X.T, temp, dense_output=False) + return score + + def _fisher_matrix(self, coef, phi, X, y, weight, link): + """The Fisher information matrix, also known as expected + information matrix. It is given by + + .. math: + + \mathbf{F}(\boldsymbol{\beta}) = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} + \partial\boldsymbol{\beta}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see score function. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + d2 = link.inverse_derivative(lin_pred)**2 + d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) + fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weight, link): + """The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood. It is given by + + .. 
math: + + \mathbf{H}(\boldsymbol{\beta}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} + \partial\boldsymbol{\beta}^T} + = \mathbf{X}^T \legt[ + - \mathbf{D}' \mathbf{R} + + \mathbf{D}^2 \mathbf{V} \mathbf{R} + + \mathbf{D}^2 + \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, + + with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, + :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ + v(\mu_i)} + \right)`, + see score function and Fisher matrix. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + dp = link.inverse_derivative2(lin_pred) + d2 = link.inverse_derivative(lin_pred)**2 + v = self.unit_variance_derivative(mu)/self.unit_variance(mu) + r = y - mu + temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(temp, X, dense_output=False) + observed_information = safe_sparse_dot(X.T, temp, dense_output=False) + return observed_information + + def _deviance_derivative(self, coef, X, y, weight, link): + """The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a + function of the coefficients ``coef``. + This is equivalent to :math:`-2\phi` times the score function + :math:`s` (derivative of the log-likelihood). + """ + score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, + link=link) + return -2*score + + def _deviance_hessian(self, coef, X, y, weight, link): + """The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance + as a function of the coefficients ``coef``. + This is equivalent to :math:`+2\phi` times the observed information + matrix. + """ + info_matrix = self._observed_information(coef=coef, phi=1, + X=X, y=y, weight=weight, link=link) + return 2*info_matrix + + def starting_mu(self, y, weight=1): + """Starting values for the mean mu_i in IRLS.""" + return (weight*y+np.mean(weight*y))/(2.*np.sum(np.ones_like(y)*weight)) + + +class TweedieDistribution(ExponentialDispersionModel): + """A class for the Tweedie distribution. + They have mu=E[X] and Var[X] \propto mu**power. + + Attributes + ---------- + power : float + The variance power of the unit_variance + :math:`v(mu) = mu^{power}`. 
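For orientation, the power attribute maps onto the familiar special cases; a small usage sketch, assuming the module is importable as sklearn.linear_model.glm once this patch is applied (as in the tests added below)::

    from sklearn.linear_model.glm import TweedieDistribution

    for power, name in [(0, 'Normal'), (1, 'Poisson'),
                        (2, 'Gamma'), (3, 'Inverse Gaussian')]:
        dist = TweedieDistribution(power=power)
        # unit variance v(mu) = mu**power evaluated at mu = 2
        print(name, dist.unit_variance(2.0))   # 1.0, 2.0, 4.0, 8.0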
+ """ + def __init__(self, power=0): + self.power = power + self._upper_bound = np.Inf + self._upper_compare = lambda x: np.less(x, self.upper_bound) + if power < 0: + #Extreme Stable + self._lower_bound = -np.Inf + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power == 0: + #GaussianDistribution + self._lower_bound = -np.Inf + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif (power > 0) and (power < 1): + raise ValueError('For 0 1) and (power < 2): + #Compound Poisson + self._lower_bound = 0 + self._lower_compare = ( + lambda x: np.greater_equal(x, self.lower_bound)) + elif power == 2: + #GammaDistribution + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif (power > 2) and (power < 3): + #Positive Stable + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power == 3: + #InverseGaussianDistribution + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power > 3: + #Positive Stable + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + self._power = power + + @property + def lower_bound(self): + return self._lower_bound + + @property + def upper_bound(self): + return self._upper_bound + + def in_y_range(self, x): + return np.logical_and(self._lower_compare(x), self._upper_compare(x)) + + def unit_variance(self, mu): + """The unit variance of a Tweedie distribution is v(mu)=mu**power. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """The derivative of the unit variance of a Tweedie distribution is + v(mu)=power*mu**(power-1). + """ + return self.power*np.power(mu, self.power-1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + #NormalDistribution + return (y-mu)**2 + if p == 1: + #PoissonDistribution + return 2 * (np.where(y==0,0,y*np.log(y/mu))-y+mu) + elif p == 2: + #GammaDistribution + return 2 * (np.log(mu/y)+y/mu-1) + else: + #return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y,0), 2-p)/((1-p)*(2-p)) + - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + def likelihood(self, y, X, beta, phi, weight=1): + raise NotImplementedError('This function is not (yet) implemented.') + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + +GaussianDistribution = NormalDistribution + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + +class GeneralizedHyperbolicSecand(ExponentialDispersionModel): + """A class for the von Generalized Hyperbolic Secand (GHS) distribution. 
+ + The GHS distribution is for data y in (-inf, inf). + """ + def __init__(self): + self._lower_bound = -np.Inf + self._upper_bound = np.Inf + + @property + def lower_bound(self): + return self._lower_bound + + @property + def upper_bound(self): + return self._upper_bound + + def in_y_range(self, x): + np.logical_and( + np.greater(x, self.lower_bound), + np.less(x, self.lower_bound) + ) + + def unit_variance(self, mu): + return 1 + mu**2 + + def unit_variance_derivative(self, mu): + return 2*mu + + def unit_deviance(self, y, mu): + return (2*y*(np.arctan(y) - np.arctan(mu)) + + np.log((1+mu**2)/(1+y**2))) + + + +class GeneralizedLinearModel(LinearModel, RegressorMixin): + """ + Class to fit a Generalized Linear Model (GLM) based on reproductive + Exponential Dispersion Models (EDM). + + Assumptions: + + - The target values y_i are realizations of random variables + :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion + parameter :math:`\phi` and weights :math:`w_i`. + - The expectation of :math:`Y_i` is :math:`mu_i=\mathrm{E}[Y]=h(\eta_i)` + whith the linear predictor :math:`\eta=X*\beta`, inverse link function + :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` + to be estimated. + + Note that the first assumption implies + :math:`\mathrm{Var}[Y_i]=\frac{\phi}{w_i} v(\mu_i)` with uni variance + function :math:`v(\mu)`. + + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments :math:`E[Y_i]=\mu_i=h(\eta_i)` and + :math:`Var[Y_i]=\frac{\phi}{w_i} v(\mu_i)` + + The parameters :math:`\beta` are estimated by maximum likelihood which is + equivalent to minimizing the deviance. + + TODO: Estimation of the dispersion parameter phi. + + TODO: Notes on 'scaled' Poisson and weights + + Parameters + ---------- + fit_intercept : boolean, optional, default True + whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (e.g. data is expected to be already centered). + + family : ExponentialDispersionModel, optional, default NormalDistribution() + the distributional assumption of the GLM + + link : Link, optional, default IdentityLink() + the link function (class) of the GLM + + fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' + method for estimation of the dispersion parameter phi. Whether to use + the chi squared statisic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'irls', 'newton-cg', 'lbfgs'}, defaul 'irls' + Algorithm to use in the optimization problem. + + - 'irls' is iterated reweighted least squares. It is the standard + algorithm for GLMs. + + - 'newton-cg', 'lbfgs' + + max_iter : int, default 100 + TODO + + tol : float + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative of + the deviance). + + start_params : {array shape (n_features, ), 'ols'}, default None + sets the start values for coef_ in the fit. + If None, default values are taken. + If 'ols' the result of an ordinary least squares in the link space + (linear predictor) is taken. + If an array is given, these values are taken as coef_ to start with. + If fit_intercept is true, the first value is assumed to be the start + value for the intercept_. + + verbose : int, default: 0 + For the lbfgs solver set verbose to any positive + number for verbosity. 
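A usage sketch for the estimator documented above (assumes this patch is applied; it mirrors the log-link test added below, the data being made up for the example)::

    import numpy as np
    from sklearn.linear_model.glm import (GeneralizedLinearModel,
                                          PoissonDistribution, LogLink)

    X = np.array([[1., 1., 1., 1., 1.], [0., 1., 2., 3., 4.]]).T
    y = np.exp(np.dot(X, [1., 2.]))    # log-linear mean, Poisson-type target

    glm = GeneralizedLinearModel(family=PoissonDistribution(), link=LogLink(),
                                 fit_intercept=False, solver='irls',
                                 start_params='ols')
    glm.fit(X, y)
    print(glm.coef_)    # expected to be close to [1., 2.]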
+ + Attributes + ---------- + coef_ : array, shape (1, n_features) + Estimated coefficients for the linear predictor (X*coef_) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\phi` if fit_dispersion is set. + + n_iter_ : int + Actual number of iterations of the solver. + + Notes + ----- + + References + ---------- + TODO + """ + + def __init__(self, fit_intercept=True, family=NormalDistribution(), + link=IdentityLink(), fit_dispersion='chisqr', solver='irls', max_iter=100, + tol=1e-4, start_params=None, verbose=0): + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = 100 + self.tol = tol + self.start_params = start_params + self.verbose = verbose + + def fit(self, X, y, weight=None): + """ + Fit a generalized linear model. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Training data + + y : numpy array of shape [n_samples] + Target values + + weight : numpy array of shape [n_samples] + Individual weights for each sample. + Var[Y_i]=phi/weight_i * v(mu) + If Y_i ~ EDM(mu, phi/w_i) then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)) + + Returns + ------- + self : returns an instance of self. + """ + if not isinstance(self.family, ExponentialDispersionModel): + raise ValueError("The argument family must be an instance of class" + "ExponentialDispersionModel.") + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool," + " got {0}".format(self.fit_intercept)) + if not self.solver in ['irls', 'lbfgs', 'newton-cg']: + raise ValueError("GLM Regression supports only irls, lbfgs and" + "newton-cg solvers, got {0}".format(self.solver)) + if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: + raise ValueError("Maximum number of iteration must be positive;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol < 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + start_params = self.start_params + if start_params is not None and start_params is not 'ols': + start_params = np.atleast_1d(start_params) + if start_params.shape[0] != X.shape[1] + self.fit_intercept: + raise ValueError("Start values for parameters must have the" + "right length; required length {0}, got {1}".format( + X.shape[1] + self.fit_intercept, start_params.shape[0])) + + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=False) + y = y.astype(np.float64) + + if not np.all(self.family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}".format(self.family.__class__.__name__)) + + if weight is None: + weight = np.ones_like(y) + elif np.isscalar(weight): + weight = weight*np.ones_like(y) + else: + weight = np.atleast_1d(weight) + if weight.ndim > 1: + raise ValueError("Weights must be 1D array or scalar") + elif weight.shape[0] != y.shape[0]: + raise ValueError("Weights must have the same length as y") + + + if self.fit_intercept: + #intercept is first column <=> coef[0] is for intecept + if sparse.issparse(X): + Xnew = sparse.hstack([np.ones([X.shape[0],1]), X]) + else: + Xnew = np.concatenate((np.ones((X.shape[0],1)), X), axis=1) + else: + Xnew = X + + n_samples, n_features = Xnew.shape + + #Note: Since phi does not enter 
the estimation of mu_i=E[y_i] + # set it to 1 where convenient. + + #set start values for coef + coef = None + if start_params is None: + #Use mu_start and apply one irls step to calculate coef + mu = self.family.starting_mu(y, weight) + #linear predictor + eta = self.link.link(mu) + #h'(eta) + hp = self.link.inverse_derivative(eta) + #working weights w, in principle a diagonal matrix + #therefore here just as 1d array + w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + wroot = np.sqrt(w) + #working observations + yw = eta + (y-mu)/hp + #least squares rescaled with wroot + wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) + X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) + yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) + coef = linalg.lstsq(X_rescale, yw_rescale)[0] + elif start_params is 'ols': + reg = LinearRegression(copy_X=False, + fit_intercept=False) + reg.fit(Xnew, self.link.link(y)) + coef = reg.coef_ + else: + coef = start_params + + #algorithms for optimiation + #TODO: Parallelize it + self.n_iter_ = 0 + converged = False + if self.solver == 'irls': + #linear predictor + eta = safe_sparse_dot(Xnew, coef, dense_output=True) + mu = self.link.inverse(eta) + while self.n_iter_ < self.max_iter: + self.n_iter_ += 1 + #coef_old not used so far. + #coef_old = coef + #h'(eta) + hp = self.link.inverse_derivative(eta) + #working weights w, in principle a diagonal matrix + #therefore here just as 1d array + w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + wroot = np.sqrt(w) + #working observations + yw = eta + (y-mu)/hp + #least squares rescaled with wroot + wroot = sparse.dia_matrix((wroot, 0), + shape=(n_samples, n_samples)) + X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) + yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) + coef, residues, rank, singular_ = ( + linalg.lstsq(X_rescale, yw_rescale)) + + #updated linear predictor + #do it here for updated values for tolerance + eta = safe_sparse_dot(Xnew, coef, dense_output=True) + mu = self.link.inverse(eta) + + #which tolerace? |coef - coef_old| or gradient? + #use gradient for compliance with newton-cg and lbfgs + #TODO: faster computation of gradient, use mu and eta directly + gradient = self.family._deviance_derivative(coef=coef, + X=Xnew, y=y, weight=weight, link=self.link) + if (np.max(np.abs(gradient)) <= self.tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})".format(self.max_iter)) + + #TODO: performance: make one function return both deviance and gradient + elif self.solver == 'lbfgs': + func = self.family._deviance + fprime = self.family._deviance_derivative + args = (Xnew, y, weight, self.link) + coef, loss, info = optimize.fmin_l_bfgs_b( + func, coef, fprime=fprime, + args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." 
+ " Increase the number of iterations.") + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}".format( + info["task"])) + self.n_iter_ = info['nit'] + elif self.solver == 'newton-cg': + func = self.family._deviance + grad = self.family._deviance_derivative + def grad_hess(coef, X, y, weight, link): + grad = (self.family + ._deviance_derivative(coef, X, y, weight, link)) + hessian = (self.family + ._deviance_hessian(coef, X, y, weight,link)) + def Hs(s): + ret = np.dot(hessian, s) + return ret + return grad, Hs + hess = grad_hess + args = (Xnew, y, weight, self.link) + coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, + maxiter=self.max_iter, tol=self.tol) + self.coef_ = coef + + if self.fit_intercept is True: + self.intercept_ = coef[0] + self.coef_ = coef[1:] + else: + self.coef_ = coef + + if self.fit_dispersion in ['chisqr', 'deviance']: + self.dispersion_ = self.estimate_phi(y, X, weight) + + return self + + def predict(self, X, weight=1): + check_is_fitted(self, "coef_") + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + return mu*weight + + def estimate_phi(self, y, X, weight): + n_samples, n_features = X.shape + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weight*(y-mu)**2/self.family.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self.family.deviance(y, mu, weight) + return dev/(n_samples - n_features) + + def score(self, X, y, weight=1): + """The natural score for a GLM is -deviance. + Returns the weight averaged negitive deviance (the better the score, + the better the fit). Maximum score is therefore 0. + """ + #RegressorMixin has R^2 score. 
+ #TODO: Make it more compatible with the score function in + # sklearn.metrics.regression.py + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + output_errors = self.family.unit_deviance(y,mu) + weight = weight * np.ones_like(y) + return np.average(output_errors, weights=weight) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..2a88a7ec899d9 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,73 @@ +import numpy as np + +from sklearn.linear_model.glm import (Link, IdentityLink, LogLink, + ExponentialDispersionModel, TweedieDistribution, + NormalDistribution, GaussianDistribution, + PoissonDistribution, GammaDistribution, + InverseGaussianDistribution, GeneralizedHyperbolicSecand, + GeneralizedLinearModel) + +from sklearn.utils.testing import (assert_equal, assert_array_equal, + assert_array_almost_equal) + +def test_family_bounds(): + """Test the valid range of distributions + """ + family = NormalDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [True, True, True]) + + family = PoissonDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, True, True]) + + family = TweedieDistribution(power=1.5) + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, True, True]) + + family = GammaDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + + family = InverseGaussianDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + + family = TweedieDistribution(power=4.5) + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + +def test_glm_identiy_regression(): + """Test linear regression on a simple dataset + """ + coef = [1,2] + X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + y = np.dot(X, coef) + for solver in ['irls', 'lbfgs', 'newton-cg']: + for family in (GaussianDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)): + glm = GeneralizedLinearModel(family=family, + fit_intercept=False, solver=solver) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + +def test_glm_log_regression(): + """Test linear regression on a simple dataset + """ + coef = [1,2] + X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + y = np.exp(np.dot(X, coef)) + #for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls']: + #for family in [GaussianDistribution(), PoissonDistribution(), + # GammaDistribution(), InverseGaussianDistribution(), + # TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]: + for family in [GaussianDistribution()]: + glm = GeneralizedLinearModel(family=family, + link=LogLink(), + fit_intercept=False, solver=solver, start_params='ols') + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + +#TODO: Test compatibility with R's glm, glmnet From 2fc189d8351c9710c1329750545539afe3e6e40c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 19 Jul 2017 17:33:04 +0200 Subject: [PATCH 02/69] [WIP] Add Generalized Linear Models (#9405) * Fixed pep8 * Fixed flake8 * Rename GeneralizedLinearModel as GeneralizedLinearRegressor * Use of six.with_metaclass * PEP257: summary should be on same line as quotes * Docstring of class 
GeneralizedLinearRegressor: \ before mu * Arguments family and link accept strings * Use of ConvergenceWarning --- sklearn/linear_model/__init__.py | 13 +- sklearn/linear_model/glm.py | 231 ++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 75 ++++---- 3 files changed, 180 insertions(+), 139 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 5acc51e9dc87f..0c5840f343a3a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,12 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from .glm import (Link, IdentityLink, LogLink, - ExponentialDispersionModel, TweedieDistribution, - NormalDistribution, GaussianDistribution, - PoissonDistribution, GammaDistribution, - InverseGaussianDistribution, GeneralizedHyperbolicSecand, - GeneralizedLinearModel) +from .glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -44,7 +40,6 @@ 'BayesianRidge', 'ElasticNet', 'ElasticNetCV', - 'GeneralizedLinearModel', 'Hinge', 'Huber', 'HuberRegressor', @@ -84,4 +79,6 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution'] diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 8b6eb8f3bf16c..cf91a64fafc12 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -5,9 +5,6 @@ # Author: Christian Lorentzen # License: BSD 3 clause -# TODO: Which name? GeneralizedLinearModel vs GeneralizedLinearRegression. -# So far, it is GeneralizedLinearModel, since it could very easily -# extended by Bernoulli/Binomial distribution. # TODO: Which name/symbol for coefficients and weights in docu? # sklearn.linear_models uses w for coefficients. # So far, coefficients=beta and weight=w (as standard literature) @@ -17,6 +14,10 @@ # TODO: Write docu and examples # Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# So far, it is GeneralizedLinearModel, since it could very easily +# extended by Bernoulli/Binomial distribution. +# Solution: GeneralizedLinearRegressor # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. @@ -28,16 +29,17 @@ import numpy as np from scipy import linalg, optimize, sparse import warnings -from .base import LinearModel, LinearRegression -from ..base import RegressorMixin +from .base import LinearRegression +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..externals import six from ..utils import check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted - -class Link(metaclass=ABCMeta): +class Link(six.with_metaclass(ABCMeta)): """Abstract base class for Link funtions """ @@ -73,6 +75,7 @@ def inverse_derivative2(self, lin_pred): """ raise NotImplementedError + class IdentityLink(Link): """The identity link function g(x)=x. 
""" @@ -113,7 +116,7 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) -class ExponentialDispersionModel(metaclass=ABCMeta): +class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): """Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -235,7 +238,7 @@ def deviance(self, y, mu, weight=1): \left(loglike(y,\mu,\frac{phi}{w}) - loglike(y,y,\frac{phi}{w})\right).` """ - return np.sum(weight*self.unit_deviance(y,mu)) + return np.sum(weight*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weight, link): """The deviance as a function of the coefficients ``coef`` @@ -248,7 +251,7 @@ def _deviance(self, coef, X, y, weight, link): def deviance_derivative(self, y, mu, weight=1): """The derivative w.r.t. mu of the deviance.` """ - return weight*self.unit_deviance_derivative(y,mu) + return weight*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weight, link): """The score function :math:`s` is the derivative of the @@ -269,7 +272,7 @@ def _score(self, coef, phi, X, y, weight, link): sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) score = safe_sparse_dot(X.T, temp, dense_output=False) return score @@ -294,7 +297,7 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) return fisher_matrix @@ -329,7 +332,7 @@ def _observed_information(self, coef, phi, X, y, weight, link): v = self.unit_variance_derivative(mu)/self.unit_variance(mu) r = y - mu temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(temp, X, dense_output=False) observed_information = safe_sparse_dot(X.T, temp, dense_output=False) return observed_information @@ -341,7 +344,7 @@ def _deviance_derivative(self, coef, X, y, weight, link): :math:`s` (derivative of the log-likelihood). """ score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, - link=link) + link=link) return -2*score def _deviance_hessian(self, coef, X, y, weight, link): @@ -350,8 +353,8 @@ def _deviance_hessian(self, coef, X, y, weight, link): This is equivalent to :math:`+2\phi` times the observed information matrix. 
""" - info_matrix = self._observed_information(coef=coef, phi=1, - X=X, y=y, weight=weight, link=link) + info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, + weight=weight, link=link) return 2*info_matrix def starting_mu(self, y, weight=1): @@ -374,39 +377,39 @@ def __init__(self, power=0): self._upper_bound = np.Inf self._upper_compare = lambda x: np.less(x, self.upper_bound) if power < 0: - #Extreme Stable + # Extreme Stable self._lower_bound = -np.Inf self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power == 0: - #GaussianDistribution + # GaussianDistribution self._lower_bound = -np.Inf self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): - #Compound Poisson + # Compound Poisson self._lower_bound = 0 self._lower_compare = ( lambda x: np.greater_equal(x, self.lower_bound)) elif power == 2: - #GammaDistribution + # GammaDistribution self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif (power > 2) and (power < 3): - #Positive Stable + # Positive Stable self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power == 3: - #InverseGaussianDistribution + # InverseGaussianDistribution self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power > 3: - #Positive Stable + # Positive Stable self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) @@ -418,7 +421,7 @@ def power(self): def power(self, power): if not isinstance(power, numbers.Real): raise TypeError('power must be a real number, input was {0}' - .format(power)) + .format(power)) self._power = power @property @@ -446,19 +449,19 @@ def unit_variance_derivative(self, mu): def unit_deviance(self, y, mu): p = self.power if p == 0: - #NormalDistribution + # NormalDistribution return (y-mu)**2 if p == 1: - #PoissonDistribution - return 2 * (np.where(y==0,0,y*np.log(y/mu))-y+mu) + # PoissonDistribution + return 2 * (np.where(y == 0, 0, y*np.log(y/mu))-y+mu) elif p == 2: - #GammaDistribution + # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) else: - #return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y,0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) def likelihood(self, y, X, beta, phi, weight=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -469,23 +472,25 @@ class NormalDistribution(TweedieDistribution): def __init__(self): super(NormalDistribution, self).__init__(power=0) -GaussianDistribution = NormalDistribution class PoissonDistribution(TweedieDistribution): """Class for the scaled Poisson distribution""" def __init__(self): super(PoissonDistribution, self).__init__(power=1) + class GammaDistribution(TweedieDistribution): """Class for the Gamma distribution""" def __init__(self): super(GammaDistribution, self).__init__(power=2) + class InverseGaussianDistribution(TweedieDistribution): """Class for the scaled InverseGaussianDistribution distribution""" def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) + class GeneralizedHyperbolicSecand(ExponentialDispersionModel): """A class for the von Generalized Hyperbolic Secand (GHS) distribution. 
@@ -516,12 +521,11 @@ def unit_variance_derivative(self, mu): return 2*mu def unit_deviance(self, y, mu): - return (2*y*(np.arctan(y) - np.arctan(mu)) - + np.log((1+mu**2)/(1+y**2))) - + return (2*y*(np.arctan(y) - np.arctan(mu)) + + np.log((1+mu**2)/(1+y**2))) -class GeneralizedLinearModel(LinearModel, RegressorMixin): +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). @@ -531,7 +535,7 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): - The target values y_i are realizations of random variables :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion parameter :math:`\phi` and weights :math:`w_i`. - - The expectation of :math:`Y_i` is :math:`mu_i=\mathrm{E}[Y]=h(\eta_i)` + - The expectation of :math:`Y_i` is :math:`\mu_i=\mathrm{E}[Y]=h(\eta_i)` whith the linear predictor :math:`\eta=X*\beta`, inverse link function :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` to be estimated. @@ -549,7 +553,9 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): TODO: Estimation of the dispersion parameter phi. - TODO: Notes on 'scaled' Poisson and weights + TODO: Notes on weights and 'scaled' Poisson, e.g. fit y = x/w with + with x=counts and w=exposure (time, money, persons, ...) => y is a + ratio with weights w. Parameters ---------- @@ -558,10 +564,12 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): to False, no intercept will be used in calculations (e.g. data is expected to be already centered). - family : ExponentialDispersionModel, optional, default NormalDistribution() - the distributional assumption of the GLM + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance + of a subclass of ExponentialDispersionModel, optional, default 'normal' + the distributional assumption of the GLM. - link : Link, optional, default IdentityLink() + link : {'identity', 'log'} or an instance of a subclass of Link, + optional, default IdentityLink() the link function (class) of the GLM fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' @@ -622,8 +630,8 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): """ def __init__(self, fit_intercept=True, family=NormalDistribution(), - link=IdentityLink(), fit_dispersion='chisqr', solver='irls', max_iter=100, - tol=1e-4, start_params=None, verbose=0): + link=IdentityLink(), fit_dispersion='chisqr', solver='irls', + max_iter=100, tol=1e-4, start_params=None, verbose=0): self.fit_intercept = fit_intercept self.family = family self.link = link @@ -635,8 +643,7 @@ def __init__(self, fit_intercept=True, family=NormalDistribution(), self.verbose = verbose def fit(self, X, y, weight=None): - """ - Fit a generalized linear model. + """Fit a generalized linear model. Parameters ---------- @@ -657,12 +664,32 @@ def fit(self, X, y, weight=None): self : returns an instance of self. 
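To illustrate the weight semantics referred to in the TODO above (counts x observed under exposure w, modelled as the ratio y = x/w), a sketch assuming this patch is applied; the data are invented for the example::

    import numpy as np
    from sklearn.linear_model.glm import (GeneralizedLinearRegressor,
                                          PoissonDistribution, LogLink)

    exposure = np.array([10., 20., 30., 40.])   # e.g. observation time
    counts = np.array([5., 22., 31., 42.])
    X = np.array([[1., 0.], [1., 1.], [1., 2.], [1., 3.]])
    y = counts / exposure                       # modelled ratio

    glm = GeneralizedLinearRegressor(family=PoissonDistribution(),
                                     link=LogLink(), fit_intercept=False)
    glm.fit(X, y, weight=exposure)
    # predict() with weight gives expected counts = predicted rate * exposure
    print(glm.predict(X, weight=exposure))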
""" if not isinstance(self.family, ExponentialDispersionModel): - raise ValueError("The argument family must be an instance of class" - "ExponentialDispersionModel.") + if self.family == 'normal': + self.family = NormalDistribution() + elif self.family == 'poisson': + self.family = PoissonDistribution() + elif self.family == 'gamma': + self.family = GammaDistribution() + elif self.family == 'inverse.gaussian': + self.family = InverseGaussianDistribution() + else: + raise ValueError( + "The argument family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") + if not isinstance(self.link, Link): + if self.link == 'identity': + self.link = IdentityLink() + if self.link == 'log': + self.link = LogLink() + else: + raise ValueError( + "The argument link must be an instance of class Link or" + " an element of ['identity', 'log'].") if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool," " got {0}".format(self.fit_intercept)) - if not self.solver in ['irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['irls', 'lbfgs', 'newton-cg']: raise ValueError("GLM Regression supports only irls, lbfgs and" "newton-cg solvers, got {0}".format(self.solver)) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: @@ -676,8 +703,9 @@ def fit(self, X, y, weight=None): start_params = np.atleast_1d(start_params) if start_params.shape[0] != X.shape[1] + self.fit_intercept: raise ValueError("Start values for parameters must have the" - "right length; required length {0}, got {1}".format( - X.shape[1] + self.fit_intercept, start_params.shape[0])) + "right length; required length {0}, got {1}" + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0])) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], y_numeric=True, multi_output=False) @@ -685,7 +713,8 @@ def fit(self, X, y, weight=None): if not np.all(self.family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}".format(self.family.__class__.__name__)) + "range for family {0}" + .format(self.family.__class__.__name__)) if weight is None: weight = np.ones_like(y) @@ -698,96 +727,96 @@ def fit(self, X, y, weight=None): elif weight.shape[0] != y.shape[0]: raise ValueError("Weights must have the same length as y") - if self.fit_intercept: - #intercept is first column <=> coef[0] is for intecept + # intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0],1]), X]) + Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: - Xnew = np.concatenate((np.ones((X.shape[0],1)), X), axis=1) + Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) else: Xnew = X n_samples, n_features = Xnew.shape - #Note: Since phi does not enter the estimation of mu_i=E[y_i] - # set it to 1 where convenient. + # Note: Since dispersion_ alias phi does not enter the estimation + # of mu_i=E[y_i] set it to 1 where convenient. 
- #set start values for coef + # set start values for coef coef = None if start_params is None: - #Use mu_start and apply one irls step to calculate coef + # Use mu_start and apply one irls step to calculate coef mu = self.family.starting_mu(y, weight) - #linear predictor + # linear predictor eta = self.link.link(mu) - #h'(eta) + # h'(eta) hp = self.link.inverse_derivative(eta) - #working weights w, in principle a diagonal matrix - #therefore here just as 1d array + # working weights w, in principle a diagonal matrix + # therefore here just as 1d array w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) - #working observations + # working observations yw = eta + (y-mu)/hp - #least squares rescaled with wroot + # least squares rescaled with wroot wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) coef = linalg.lstsq(X_rescale, yw_rescale)[0] elif start_params is 'ols': - reg = LinearRegression(copy_X=False, - fit_intercept=False) + reg = LinearRegression(copy_X=False, fit_intercept=False) reg.fit(Xnew, self.link.link(y)) coef = reg.coef_ else: coef = start_params - #algorithms for optimiation - #TODO: Parallelize it + # algorithms for optimiation + # TODO: Parallelize it self.n_iter_ = 0 converged = False if self.solver == 'irls': - #linear predictor + # linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = self.link.inverse(eta) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 - #coef_old not used so far. - #coef_old = coef - #h'(eta) + # coef_old not used so far. + # coef_old = coef + # h'(eta) hp = self.link.inverse_derivative(eta) - #working weights w, in principle a diagonal matrix - #therefore here just as 1d array + # working weights w, in principle a diagonal matrix + # therefore here just as 1d array w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) - #working observations + # working observations yw = eta + (y-mu)/hp - #least squares rescaled with wroot + # least squares rescaled with wroot wroot = sparse.dia_matrix((wroot, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) - coef, residues, rank, singular_ = ( + coef, residues, rank, singular_ = ( linalg.lstsq(X_rescale, yw_rescale)) - #updated linear predictor - #do it here for updated values for tolerance + # updated linear predictor + # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = self.link.inverse(eta) - #which tolerace? |coef - coef_old| or gradient? - #use gradient for compliance with newton-cg and lbfgs - #TODO: faster computation of gradient, use mu and eta directly - gradient = self.family._deviance_derivative(coef=coef, - X=Xnew, y=y, weight=weight, link=self.link) + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # TODO: faster computation of gradient, use mu and eta directly + gradient = self.family._deviance_derivative( + coef=coef, X=Xnew, y=y, weight=weight, link=self.link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break if not converged: warnings.warn("irls failed to converge. 
Increase the number " - "of iterations (currently {0})".format(self.max_iter)) + "of iterations (currently {0})" + .format(self.max_iter), ConvergenceWarning) - #TODO: performance: make one function return both deviance and gradient + # TODO: performance: make one function return both deviance and + # gradient of deviance elif self.solver == 'lbfgs': func = self.family._deviance fprime = self.family._deviance_derivative @@ -800,7 +829,8 @@ def fit(self, X, y, weight=None): if self.verbose > 0: if info["warnflag"] == 1: warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.") + " Increase the number of iterations.", + ConvergenceWarning) elif info["warnflag"] == 2: warnings.warn("lbfgs failed for the reason: {0}".format( info["task"])) @@ -808,11 +838,13 @@ def fit(self, X, y, weight=None): elif self.solver == 'newton-cg': func = self.family._deviance grad = self.family._deviance_derivative + def grad_hess(coef, X, y, weight, link): - grad = (self.family - ._deviance_derivative(coef, X, y, weight, link)) - hessian = (self.family - ._deviance_hessian(coef, X, y, weight,link)) + grad = (self.family._deviance_derivative( + coef, X, y, weight, link)) + hessian = (self.family._deviance_hessian( + coef, X, y, weight, link)) + def Hs(s): ret = np.dot(hessian, s) return ret @@ -820,7 +852,7 @@ def Hs(s): hess = grad_hess args = (Xnew, y, weight, self.link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, - maxiter=self.max_iter, tol=self.tol) + maxiter=self.max_iter, tol=self.tol) self.coef_ = coef if self.fit_intercept is True: @@ -835,6 +867,9 @@ def Hs(s): return self def predict(self, X, weight=1): + """Prediction with features X. + If weights are given, returns prediction*weights. + """ check_is_fitted(self, "coef_") eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: @@ -860,13 +895,13 @@ def score(self, X, y, weight=1): Returns the weight averaged negitive deviance (the better the score, the better the fit). Maximum score is therefore 0. """ - #RegressorMixin has R^2 score. - #TODO: Make it more compatible with the score function in + # RegressorMixin has R^2 score. 
+ # TODO: Make it more compatible with the score function in # sklearn.metrics.regression.py eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ mu = self.link.inverse(eta) - output_errors = self.family.unit_deviance(y,mu) + output_errors = self.family.unit_deviance(y, mu) weight = weight * np.ones_like(y) return np.average(output_errors, weights=weight) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 2a88a7ec899d9..a4d4ea8650860 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,73 +1,82 @@ import numpy as np -from sklearn.linear_model.glm import (Link, IdentityLink, LogLink, - ExponentialDispersionModel, TweedieDistribution, - NormalDistribution, GaussianDistribution, - PoissonDistribution, GammaDistribution, - InverseGaussianDistribution, GeneralizedHyperbolicSecand, - GeneralizedLinearModel) +from sklearn.linear_model.glm import ( + # Link, IdentityLink, + LogLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + # GeneralizedHyperbolicSecand, + GeneralizedLinearRegressor) + +from sklearn.utils.testing import ( + # assert_equal, + assert_array_equal, assert_array_almost_equal) -from sklearn.utils.testing import (assert_equal, assert_array_equal, - assert_array_almost_equal) def test_family_bounds(): """Test the valid range of distributions """ family = NormalDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [True, True, True]) family = PoissonDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, True, True]) family = TweedieDistribution(power=1.5) - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, True, True]) family = GammaDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) family = InverseGaussianDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) family = TweedieDistribution(power=4.5) - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) + def test_glm_identiy_regression(): """Test linear regression on a simple dataset """ - coef = [1,2] - X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + coef = [1, 2] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + families = ( + NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) for solver in ['irls', 'lbfgs', 'newton-cg']: - for family in (GaussianDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)): - glm = GeneralizedLinearModel(family=family, - fit_intercept=False, solver=solver) + for family in families: + glm = GeneralizedLinearRegressor( + family=family, fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) + def test_glm_log_regression(): """Test linear regression on a simple dataset """ - coef = [1,2] - X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + coef = [1, 2] + X = 
np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) - #for solver in ['irls', 'lbfgs', 'newton-cg']: - for solver in ['irls']: - #for family in [GaussianDistribution(), PoissonDistribution(), - # GammaDistribution(), InverseGaussianDistribution(), - # TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]: - for family in [GaussianDistribution()]: - glm = GeneralizedLinearModel(family=family, - link=LogLink(), - fit_intercept=False, solver=solver, start_params='ols') + families = ( + NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + for solver in ['irls', 'lbfgs', 'newton-cg']: + for family in families: + glm = GeneralizedLinearRegressor( + family=family, link=LogLink(), fit_intercept=False, + solver=solver, start_params='ols') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) -#TODO: Test compatibility with R's glm, glmnet + +# TODO: Test compatibility with R's glm, glmnet From a6137d85401ef72976327c211f44d721d9f81e00 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 19 Jul 2017 17:41:49 +0200 Subject: [PATCH 03/69] [WIP] Add Generalized Linear Models (#9405) * GeneralizedLinearRegressor added to doc/modules/classes.rst --- doc/modules/classes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index e09ca0422d8a7..2d451b6758eb1 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -720,6 +720,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV From b0be167080588a35dc1f4b762d961edb897b1019 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 9 Aug 2017 13:38:49 +0200 Subject: [PATCH 04/69] [WIP] Add Generalized Linear Models (#9405) * fixed bug: init parameter max_iter * fix API for family and link: default parameter changed to string non public variables self._family_instance and self._link_instance * fixed bug in score, minus sign forgotten * added check_is_fitted to estimate_phi and score * added check_array(X) in predict * replaced lambda functions in TweedieDistribution * some documentation --- sklearn/linear_model/glm.py | 196 ++++++++++++++++++++++-------------- 1 file changed, 122 insertions(+), 74 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index cf91a64fafc12..0ee1564049329 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -15,9 +15,10 @@ # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. -# So far, it is GeneralizedLinearModel, since it could very easily -# extended by Bernoulli/Binomial distribution. -# Solution: GeneralizedLinearRegressor +# Estimators in sklearn are either regressors or classifiers. A Generalized +# Linear Model does both depending on the chosen distribution, e.g. Normal => +# regressor, Bernoulli/Binomial => classifier. +# Solution: GeneralizedLinearRegressor since this is the focus. # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. 
@@ -33,7 +34,7 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six -from ..utils import check_X_y +from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted @@ -372,46 +373,67 @@ class TweedieDistribution(ExponentialDispersionModel): The variance power of the unit_variance :math:`v(mu) = mu^{power}`. """ + def _less_upper_bound(self, x): + return np.less(x, self.upper_bound) + + def _less_equal_upper_bound(self, x): + return np.less_equal(x, self.upper_bound) + + def _greater_lower_bound(self, x): + return np.greater(x, self.lower_bound) + + def _greater_equal_lower_bound(self, x): + return np.greater_equal(x, self.lower_bound) + def __init__(self, power=0): self.power = power self._upper_bound = np.Inf - self._upper_compare = lambda x: np.less(x, self.upper_bound) + # self._upper_compare = lambda x: np.less(x, self.upper_bound) + self._upper_compare = self._less_upper_bound if power < 0: # Extreme Stable self._lower_bound = -np.Inf - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power == 0: - # GaussianDistribution + # NormalDistribution self._lower_bound = -np.Inf - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): # Compound Poisson self._lower_bound = 0 - self._lower_compare = ( - lambda x: np.greater_equal(x, self.lower_bound)) + # self._lower_compare = ( + # lambda x: np.greater_equal(x, self.lower_bound)) + self._lower_compare = self._greater_equal_lower_bound elif power == 2: # GammaDistribution self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif (power > 2) and (power < 3): # Positive Stable self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power == 3: # InverseGaussianDistribution self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power > 3: # Positive Stable self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound @property def power(self): @@ -530,6 +552,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). + #TODO: This belongs to User Guide Assumptions: - The target values y_i are realizations of random variables @@ -559,25 +582,26 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- - fit_intercept : boolean, optional, default True - whether to calculate the intercept for this model. If set - to False, no intercept will be used in calculations - (e.g. 
data is expected to be already centered). + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance - of a subclass of ExponentialDispersionModel, optional, default 'normal' + of a subclass of ExponentialDispersionModel, optional + (default='normal') the distributional assumption of the GLM. link : {'identity', 'log'} or an instance of a subclass of Link, - optional, default IdentityLink() - the link function (class) of the GLM + optional (default='identity') + the link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). - fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul='chisqr') method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'newton-cg', 'lbfgs'}, defaul 'irls' + solver : {'irls', 'newton-cg', 'lbfgs'}, optional (defaul='irls') Algorithm to use in the optimization problem. - 'irls' is iterated reweighted least squares. It is the standard @@ -585,16 +609,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'newton-cg', 'lbfgs' - max_iter : int, default 100 + max_iter : int, optional (default=100) TODO - tol : float + tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative of the deviance). - start_params : {array shape (n_features, ), 'ols'}, default None + start_params : {array shape (n_features, ), 'ols'}, optional (default=None) sets the start values for coef_ in the fit. If None, default values are taken. If 'ols' the result of an ordinary least squares in the link space @@ -603,9 +627,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If fit_intercept is true, the first value is assumed to be the start value for the intercept_. - verbose : int, default: 0 - For the lbfgs solver set verbose to any positive - number for verbosity. + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. Attributes ---------- @@ -629,15 +652,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO """ - def __init__(self, fit_intercept=True, family=NormalDistribution(), - link=IdentityLink(), fit_dispersion='chisqr', solver='irls', + def __init__(self, fit_intercept=True, family='normal', + link='identity', fit_dispersion='chisqr', solver='irls', max_iter=100, tol=1e-4, start_params=None, verbose=0): self.fit_intercept = fit_intercept self.family = family self.link = link self.fit_dispersion = fit_dispersion self.solver = solver - self.max_iter = 100 + self.max_iter = max_iter self.tol = tol self.start_params = start_params self.verbose = verbose @@ -663,29 +686,38 @@ def fit(self, X, y, weight=None): ------- self : returns an instance of self. 
""" - if not isinstance(self.family, ExponentialDispersionModel): + # Garantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: if self.family == 'normal': - self.family = NormalDistribution() + self._family_instance = NormalDistribution() elif self.family == 'poisson': - self.family = PoissonDistribution() + self._family_instance = PoissonDistribution() elif self.family == 'gamma': - self.family = GammaDistribution() + self._family_instance = GammaDistribution() elif self.family == 'inverse.gaussian': - self.family = InverseGaussianDistribution() + self._family_instance = InverseGaussianDistribution() else: raise ValueError( - "The argument family must be an instance of class" + "The family must be an instance of class" " ExponentialDispersionModel or an element of" " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") - if not isinstance(self.link, Link): + + # Garantee that self._link_instance is set to an instance of class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: if self.link == 'identity': - self.link = IdentityLink() - if self.link == 'log': - self.link = LogLink() + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() else: raise ValueError( - "The argument link must be an instance of class Link or" + "The link must be an instance of class Link or" " an element of ['identity', 'log'].") + if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool," " got {0}".format(self.fit_intercept)) @@ -711,10 +743,13 @@ def fit(self, X, y, weight=None): y_numeric=True, multi_output=False) y = y.astype(np.float64) - if not np.all(self.family.in_y_range(y)): + family = self._family_instance + link = self._link_instance + + if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" - .format(self.family.__class__.__name__)) + .format(family.__class__.__name__)) if weight is None: weight = np.ones_like(y) @@ -745,14 +780,14 @@ def fit(self, X, y, weight=None): coef = None if start_params is None: # Use mu_start and apply one irls step to calculate coef - mu = self.family.starting_mu(y, weight) + mu = family.starting_mu(y, weight) # linear predictor - eta = self.link.link(mu) + eta = link.link(mu) # h'(eta) - hp = self.link.inverse_derivative(eta) + hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -763,7 +798,7 @@ def fit(self, X, y, weight=None): coef = linalg.lstsq(X_rescale, yw_rescale)[0] elif start_params is 'ols': reg = LinearRegression(copy_X=False, fit_intercept=False) - reg.fit(Xnew, self.link.link(y)) + reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: coef = start_params @@ -775,16 +810,16 @@ def fit(self, X, y, weight=None): if self.solver == 'irls': # linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = self.link.inverse(eta) + mu = link.inverse(eta) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 # coef_old not used so far. 
# coef_old = coef # h'(eta) - hp = self.link.inverse_derivative(eta) + hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -799,13 +834,13 @@ def fit(self, X, y, weight=None): # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = self.link.inverse(eta) + mu = link.inverse(eta) # which tolerace? |coef - coef_old| or gradient? # use gradient for compliance with newton-cg and lbfgs # TODO: faster computation of gradient, use mu and eta directly - gradient = self.family._deviance_derivative( - coef=coef, X=Xnew, y=y, weight=weight, link=self.link) + gradient = family._deviance_derivative( + coef=coef, X=Xnew, y=y, weight=weight, link=link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -818,9 +853,9 @@ def fit(self, X, y, weight=None): # TODO: performance: make one function return both deviance and # gradient of deviance elif self.solver == 'lbfgs': - func = self.family._deviance - fprime = self.family._deviance_derivative - args = (Xnew, y, weight, self.link) + func = family._deviance + fprime = family._deviance_derivative + args = (Xnew, y, weight, link) coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, @@ -836,13 +871,13 @@ def fit(self, X, y, weight=None): info["task"])) self.n_iter_ = info['nit'] elif self.solver == 'newton-cg': - func = self.family._deviance - grad = self.family._deviance_derivative + func = family._deviance + grad = family._deviance_derivative def grad_hess(coef, X, y, weight, link): - grad = (self.family._deviance_derivative( + grad = (family._deviance_derivative( coef, X, y, weight, link)) - hessian = (self.family._deviance_hessian( + hessian = (family._deviance_hessian( coef, X, y, weight, link)) def Hs(s): @@ -850,7 +885,7 @@ def Hs(s): return ret return grad, Hs hess = grad_hess - args = (Xnew, y, weight, self.link) + args = (Xnew, y, weight, link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) self.coef_ = coef @@ -871,37 +906,50 @@ def predict(self, X, weight=1): If weights are given, returns prediction*weights. """ check_is_fitted(self, "coef_") + X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + # TODO: validation of weight eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) + mu = self._link_instance.inverse(eta) return mu*weight def estimate_phi(self, y, X, weight): + """Estimation of the dispersion parameter. + Returns the estimate. 
+ """ + check_is_fitted(self, "coef_") n_samples, n_features = X.shape eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) + mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(weight*(y-mu)**2/self.family.unit_variance(mu)) + chisq = np.sum(weight*(y-mu)**2 / + self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self.family.deviance(y, mu, weight) + dev = self._family_instance.deviance(y, mu, weight) return dev/(n_samples - n_features) +# TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" +# in check_estimator for score +# from sklearn.utils.estimator_checks import check_estimator +# from sklearn.linear_model import GeneralizedLinearRegressor +# check_estimator(GeneralizedLinearRegressor) def score(self, X, y, weight=1): """The natural score for a GLM is -deviance. - Returns the weight averaged negitive deviance (the better the score, + Returns the weight averaged negative deviance (the better the score, the better the fit). Maximum score is therefore 0. """ # RegressorMixin has R^2 score. # TODO: Make it more compatible with the score function in # sklearn.metrics.regression.py + check_is_fitted(self, "coef_") eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) - output_errors = self.family.unit_deviance(y, mu) + mu = self._link_instance.inverse(eta) + output_errors = self._family_instance.unit_deviance(y, mu) weight = weight * np.ones_like(y) - return np.average(output_errors, weights=weight) + return -np.average(output_errors, weights=weight) From 85c52ec9c6adb3b1f75650cfa7fe0b770393d24e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 13 Aug 2017 01:46:16 +0200 Subject: [PATCH 05/69] [WIP] Add Generalized Linear Models (#9405) * make raw docstrings where appropriate * make ExponentialDispersionModel (i.e. TweedieDistribution) pickable: ExponentialDispersionModel has new properties include_lower_bound, method in_y_range is not abstract anymore. * set self.intercept_=0 if fit_intercept=False, such that it is always defined. * set score to D2, a generalized R2 with deviance instead of squared error, as does glmnet. This also solves issues with check_regressors_train(GeneralizedLinearRegressor), which assumes R2 score. * change of names: weight to weights in ExponentialDispersionModel and to sample_weight in GeneralizedLinearRegressor * add class method linear_predictor --- sklearn/linear_model/glm.py | 330 ++++++++++++++++++++++-------------- 1 file changed, 199 insertions(+), 131 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 0ee1564049329..b80842f817f4d 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -5,13 +5,16 @@ # Author: Christian Lorentzen # License: BSD 3 clause +# TODO: Write more tests # TODO: Which name/symbol for coefficients and weights in docu? # sklearn.linear_models uses w for coefficients. 
-# So far, coefficients=beta and weight=w (as standard literature) -# TODO: Add l2-penalty +# So far, coefficients=beta and weights=w (as standard literature) +# TODO: Add l2-penalty (maybe more general w.P.w with P penalty matrix) # TODO: Add l1-penalty (elastic net) # TODO: Add cross validation # TODO: Write docu and examples +# TODO: Make it as much consistent to other estimators in linear_model as +# possible # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -118,7 +121,7 @@ def inverse_derivative2(self, lin_pred): class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): - """Base class for reproductive Exponential Dispersion Models (EDM). + r"""Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -136,6 +139,8 @@ class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): ---------- lower_bound upper_bound + include_lower_bound + include_upper_bound Methods ------- @@ -174,15 +179,39 @@ def upper_bound(self): """ raise NotImplementedError() - @abstractmethod + @abstractproperty + def include_lower_bound(self): + """If True, values of y may equal lower bound: y >= lower_bound. + """ + raise NotImplementedError() + + @abstractproperty + def include_upper_bound(self): + """If True, values of y may equal upper bound: y <= upper_bound. + """ + raise NotImplementedError() + def in_y_range(self, x): """Returns true if x is in the valid range of Y~EDM. """ - raise NotImplementedError() + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) @abstractmethod def unit_variance(self, mu): - """The unit variance :math:`v(mu)` determines the variance as + r"""The unit variance :math:`v(mu)` determines the variance as a function of the mean mu by :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as @@ -194,27 +223,27 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - """The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + r"""The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. """ raise NotImplementedError() - def variance(self, mu, phi=1, weight=1): - """The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is + def variance(self, mu, phi=1, weights=1): + r"""The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, with unit variance v(mu). """ - return phi/weight * self.unit_variance(mu) + return phi/weights * self.unit_variance(mu) - def variance_derivative(self, mu, phi=1, weight=1): - """The derivative of the variance w.r.t. mu, + def variance_derivative(self, mu, phi=1, weights=1): + r"""The derivative of the variance w.r.t. mu, :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/w_i*v'(\mu_i)`, with unit variance v(mu). """ - return phi/weight * self.unit_variance_derivative(mu) + return phi/weights * self.unit_variance_derivative(mu) @abstractmethod def unit_deviance(self, y, mu): - """The unit_deviance :math:`d(y,\mu)`. + r"""The unit_deviance :math:`d(y,\mu)`. 
In terms of the log-likelihood it is given by :math:`d(y,\mu) = -2\phi\cdot \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` @@ -222,7 +251,7 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - """The derivative w.r.t. mu of the unit_deviance + r"""The derivative w.r.t. mu of the unit_deviance :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. @@ -232,30 +261,30 @@ def unit_deviance_derivative(self, y, mu): """ return -2*(y-mu)/self.unit_variance(mu) - def deviance(self, y, mu, weight=1): - """The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) - with weight :math:`w_i` and unit_deviance :math:`d(y,mu)`. + def deviance(self, y, mu, weights=1): + r"""The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) + with weights :math:`w_i` and unit_deviance :math:`d(y,mu)`. In terms of the likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{w}) - loglike(y,y,\frac{phi}{w})\right).` """ - return np.sum(weight*self.unit_deviance(y, mu)) + return np.sum(weights*self.unit_deviance(y, mu)) - def _deviance(self, coef, X, y, weight, link): + def _deviance(self, coef, X, y, weights, link): """The deviance as a function of the coefficients ``coef`` (:math:`beta`). """ lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - return self.deviance(y, mu, weight) + return self.deviance(y, mu, weights) - def deviance_derivative(self, y, mu, weight=1): + def deviance_derivative(self, y, mu, weights=1): """The derivative w.r.t. mu of the deviance.` """ - return weight*self.unit_deviance_derivative(y, mu) + return weights*self.unit_deviance_derivative(y, mu) - def _score(self, coef, phi, X, y, weight, link): - """The score function :math:`s` is the derivative of the + def _score(self, coef, phi, X, y, weights, link): + r"""The score function :math:`s` is the derivative of the log-likelihood w.r.t. the ``coef`` (:math:`\beta`). It is given by @@ -270,7 +299,7 @@ def _score(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), shape=(n_samples, n_samples)) @@ -278,8 +307,8 @@ def _score(self, coef, phi, X, y, weight, link): score = safe_sparse_dot(X.T, temp, dense_output=False) return score - def _fisher_matrix(self, coef, phi, X, y, weight, link): - """The Fisher information matrix, also known as expected + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""The Fisher information matrix, also known as expected information matrix. It is given by .. 
math: @@ -295,7 +324,7 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), shape=(n_samples, n_samples)) @@ -303,8 +332,8 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) return fisher_matrix - def _observed_information(self, coef, phi, X, y, weight, link): - """The observed information matrix, also known as the negative of + def _observed_information(self, coef, phi, X, y, weights, link): + r"""The observed information matrix, also known as the negative of the Hessian matrix of the log-likelihood. It is given by .. math: @@ -327,7 +356,7 @@ def _observed_information(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) d2 = link.inverse_derivative(lin_pred)**2 v = self.unit_variance_derivative(mu)/self.unit_variance(mu) @@ -338,33 +367,34 @@ def _observed_information(self, coef, phi, X, y, weight, link): observed_information = safe_sparse_dot(X.T, temp, dense_output=False) return observed_information - def _deviance_derivative(self, coef, X, y, weight, link): - """The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a + def _deviance_derivative(self, coef, X, y, weights, link): + r"""The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a function of the coefficients ``coef``. This is equivalent to :math:`-2\phi` times the score function :math:`s` (derivative of the log-likelihood). """ - score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, + score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) return -2*score - def _deviance_hessian(self, coef, X, y, weight, link): - """The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance + def _deviance_hessian(self, coef, X, y, weights, link): + r"""The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance as a function of the coefficients ``coef``. This is equivalent to :math:`+2\phi` times the observed information matrix. """ info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, - weight=weight, link=link) + weights=weights, link=link) return 2*info_matrix - def starting_mu(self, y, weight=1): + def starting_mu(self, y, weights=1): """Starting values for the mean mu_i in IRLS.""" - return (weight*y+np.mean(weight*y))/(2.*np.sum(np.ones_like(y)*weight)) + return ((weights*y+np.mean(weights*y)) + / (2.*np.sum(np.ones_like(y)*weights))) class TweedieDistribution(ExponentialDispersionModel): - """A class for the Tweedie distribution. + r"""A class for the Tweedie distribution. They have mu=E[X] and Var[X] \propto mu**power. Attributes @@ -373,67 +403,44 @@ class TweedieDistribution(ExponentialDispersionModel): The variance power of the unit_variance :math:`v(mu) = mu^{power}`. 
""" - def _less_upper_bound(self, x): - return np.less(x, self.upper_bound) - - def _less_equal_upper_bound(self, x): - return np.less_equal(x, self.upper_bound) - - def _greater_lower_bound(self, x): - return np.greater(x, self.lower_bound) - - def _greater_equal_lower_bound(self, x): - return np.greater_equal(x, self.lower_bound) - def __init__(self, power=0): self.power = power self._upper_bound = np.Inf - # self._upper_compare = lambda x: np.less(x, self.upper_bound) - self._upper_compare = self._less_upper_bound + self._include_upper_bound = False if power < 0: # Extreme Stable self._lower_bound = -np.Inf - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power == 0: # NormalDistribution self._lower_bound = -np.Inf - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): # Compound Poisson self._lower_bound = 0 - # self._lower_compare = ( - # lambda x: np.greater_equal(x, self.lower_bound)) - self._lower_compare = self._greater_equal_lower_bound + self._include_lower_bound = True elif power == 2: # GammaDistribution self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif (power > 2) and (power < 3): # Positive Stable self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power == 3: # InverseGaussianDistribution self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power > 3: # Positive Stable self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False @property def power(self): @@ -454,8 +461,13 @@ def lower_bound(self): def upper_bound(self): return self._upper_bound - def in_y_range(self, x): - return np.logical_and(self._lower_compare(x), self._upper_compare(x)) + @property + def include_lower_bound(self): + return self._include_lower_bound + + @property + def include_upper_bound(self): + return self._include_upper_bound def unit_variance(self, mu): """The unit variance of a Tweedie distribution is v(mu)=mu**power. 
@@ -485,7 +497,7 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, beta, phi, weight=1): + def likelihood(self, y, X, beta, phi, weights=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -521,6 +533,8 @@ class GeneralizedHyperbolicSecand(ExponentialDispersionModel): def __init__(self): self._lower_bound = -np.Inf self._upper_bound = np.Inf + self._include_lower_bound = False + self._include_upper_bound = False @property def lower_bound(self): @@ -530,11 +544,13 @@ def lower_bound(self): def upper_bound(self): return self._upper_bound - def in_y_range(self, x): - np.logical_and( - np.greater(x, self.lower_bound), - np.less(x, self.lower_bound) - ) + @property + def include_lower_bound(self): + return self._include_lower_bound + + @property + def include_upper_bound(self): + return self._include_upper_bound def unit_variance(self, mu): return 1 + mu**2 @@ -548,7 +564,7 @@ def unit_deviance(self, y, mu): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """ + r""" Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). @@ -665,7 +681,7 @@ def __init__(self, fit_intercept=True, family='normal', self.start_params = start_params self.verbose = verbose - def fit(self, X, y, weight=None): + def fit(self, X, y, sample_weight=None): """Fit a generalized linear model. Parameters @@ -676,11 +692,12 @@ def fit(self, X, y, weight=None): y : numpy array of shape [n_samples] Target values - weight : numpy array of shape [n_samples] + sample_weight : numpy array of shape [n_samples] Individual weights for each sample. Var[Y_i]=phi/weight_i * v(mu) If Y_i ~ EDM(mu, phi/w_i) then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)) + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. 
Returns ------- @@ -751,16 +768,17 @@ def fit(self, X, y, weight=None): "range for family {0}" .format(family.__class__.__name__)) - if weight is None: - weight = np.ones_like(y) - elif np.isscalar(weight): - weight = weight*np.ones_like(y) + if sample_weight is None: + weights = np.ones_like(y) + elif np.isscalar(sample_weight): + weights = sample_weight*np.ones_like(y) else: - weight = np.atleast_1d(weight) - if weight.ndim > 1: - raise ValueError("Weights must be 1D array or scalar") - elif weight.shape[0] != y.shape[0]: - raise ValueError("Weights must have the same length as y") + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != y.shape[0]: + raise ValueError("Sample weights must have the same length as" + " y") if self.fit_intercept: # intercept is first column <=> coef[0] is for intecept @@ -780,14 +798,14 @@ def fit(self, X, y, weight=None): coef = None if start_params is None: # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weight) + mu = family.starting_mu(y, weights) # linear predictor eta = link.link(mu) # h'(eta) hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weights=weights)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -819,7 +837,7 @@ def fit(self, X, y, weight=None): hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weights=weights)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -840,7 +858,7 @@ def fit(self, X, y, weight=None): # use gradient for compliance with newton-cg and lbfgs # TODO: faster computation of gradient, use mu and eta directly gradient = family._deviance_derivative( - coef=coef, X=Xnew, y=y, weight=weight, link=link) + coef=coef, X=Xnew, y=y, weights=weights, link=link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -855,7 +873,7 @@ def fit(self, X, y, weight=None): elif self.solver == 'lbfgs': func = family._deviance fprime = family._deviance_derivative - args = (Xnew, y, weight, link) + args = (Xnew, y, weights, link) coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, @@ -874,47 +892,72 @@ def fit(self, X, y, weight=None): func = family._deviance grad = family._deviance_derivative - def grad_hess(coef, X, y, weight, link): + def grad_hess(coef, X, y, weights, link): grad = (family._deviance_derivative( - coef, X, y, weight, link)) + coef, X, y, weights, link)) hessian = (family._deviance_hessian( - coef, X, y, weight, link)) + coef, X, y, weights, link)) def Hs(s): ret = np.dot(hessian, s) return ret return grad, Hs hess = grad_hess - args = (Xnew, y, weight, link) + args = (Xnew, y, weights, link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) self.coef_ = coef - if self.fit_intercept is True: + if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] else: + self.intercept_ = 0. 
self.coef_ = coef if self.fit_dispersion in ['chisqr', 'deviance']: - self.dispersion_ = self.estimate_phi(y, X, weight) + self.dispersion_ = self.estimate_phi(y, X, weights) return self - def predict(self, X, weight=1): - """Prediction with features X. - If weights are given, returns prediction*weights. + def linear_predictor(self, X): + """The linear_predictor X*coef_ + intercept_. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Samples. + + Returns + ------- + C : array, shape = (n_samples) + Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) - # TODO: validation of weight - eta = safe_sparse_dot(X, self.coef_, dense_output=True) - if self.fit_intercept is True: - eta += self.intercept_ + return safe_sparse_dot(X, self.coef_, + dense_output=True) + self.intercept_ + + def predict(self, X, sample_weight=1): + """Predict uing GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Samples. + + Returns + ------- + C : array, shape = (n_samples) + Returns predicted values times sample_weight. + """ + # TODO: validation of sample_weight + eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) - return mu*weight + return mu*sample_weight - def estimate_phi(self, y, X, weight): + def estimate_phi(self, y, X, sample_weight): """Estimation of the dispersion parameter. Returns the estimate. """ @@ -925,11 +968,11 @@ def estimate_phi(self, y, X, weight): eta += self.intercept_ mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(weight*(y-mu)**2 / + chisq = np.sum(sample_weight*(y-mu)**2 / self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, weight) + dev = self._family_instance.deviance(y, mu, sample_weight) return dev/(n_samples - n_features) # TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" @@ -937,19 +980,44 @@ def estimate_phi(self, y, X, weight): # from sklearn.utils.estimator_checks import check_estimator # from sklearn.linear_model import GeneralizedLinearRegressor # check_estimator(GeneralizedLinearRegressor) - def score(self, X, y, weight=1): - """The natural score for a GLM is -deviance. - Returns the weight averaged negative deviance (the better the score, - the better the fit). Maximum score is therefore 0. + def score(self, X, y, sample_weight=None): + r"""Returns D^2, a generalization of the coefficient of determination + R^2, which uses deviance instead of squared error. + + D^2 is defined as + :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` + is the null deviance, i.e. the deviance of a model with intercept + alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean + :math:`\bar{y}` is average by sample_weight. In the case of a Normal + distribution, this D^2 equals R^2. + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test samples + + y : array-like of shape = (n_samples) + True valeus for X. + + sample_weight : array-like, shape = (n_samples), optional + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) wrt. y. """ - # RegressorMixin has R^2 score. 
- # TODO: Make it more compatible with the score function in - # sklearn.metrics.regression.py - check_is_fitted(self, "coef_") - eta = safe_sparse_dot(X, self.coef_, dense_output=True) - if self.fit_intercept is True: - eta += self.intercept_ - mu = self._link_instance.inverse(eta) - output_errors = self._family_instance.unit_deviance(y, mu) - weight = weight * np.ones_like(y) - return -np.average(output_errors, weights=weight) + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + if sample_weight is None: + weights = np.ones_like(y) + else: + weights = np.atleast_1d(sample_weight) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. - dev / dev_null From 0f4bdb3a8c5c45e80786b8156398da93bfc597e8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 18 Sep 2017 23:41:19 +0200 Subject: [PATCH 06/69] [WIP] Add Generalized Linear Models (#9405) * added L2 penalty * api change: alpha, l1_ratio, P1, P2, warm_start, check_input, copy_X * added entry in user guide * improved docstrings * helper function _irls_step --- doc/modules/linear_model.rst | 113 +++- sklearn/linear_model/glm.py | 702 ++++++++++++++++++------- sklearn/linear_model/tests/test_glm.py | 115 +++- 3 files changed, 721 insertions(+), 209 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 75b95f6c7a44f..51b3821fa6207 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -786,7 +786,7 @@ non-smooth `penalty="l1"`. This is therefore the solver of choice for sparse multinomial logistic regression. It is also the only solver that supports `penalty="elasticnet"`. -The "lbfgs" is an optimization algorithm that approximates the +The "lbfgs" is an optimization algorithm that approximates the Broyden–Fletcher–Goldfarb–Shanno algorithm [8]_, which belongs to quasi-Newton methods. The "lbfgs" solver is recommended for use for small data-sets but for larger datasets its performance suffers. [9]_ @@ -874,6 +874,117 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized linear regression +============================= + +:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two +ways [1]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). + +Secondly, the squared loss function is replaced by the deviance :math:`D` of an +exponential dispersion model (EDM) [2]_. The objective function beeing minimized +becomes + +.. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 + +\frac{\alpha(1-\rho)}{2} w^T P_2 w + +with sample weights :math:`s`. +:math:`P_1` can be used to exclude some of the coefficients in the L1 +penalty, :math:`P_2` (must be positive semi-definite) allows for a more +versatile L2 penalty. + +Use cases, where a loss different from the squared loss might be appropriate, +are the following: + + * If the target values :math:`y` are counts (integer valued) or frequencies, you might try a Poisson deviance. + + * If the target values are positive valued and skewed, you might try a Gamma deviance. 
+ + * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). + +Since the linear predictor :math:`Xw` can be negative and +Poisson, Gamma and Inverse Gaussian distributions don't have negative values, +it is convenient to apply a link function different from the identity link +:math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=\exp(Xw)`. + +Note that the feature matrix `X` should be standardized before fitting. This +ensures that the penalty treats features equally. + + >>> from sklearn import linear_model + >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0) + >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + >>> reg.coef_ + array([ 0.24630255, 0.43373521]) + >>> reg.intercept_ + -0.76383575123143277 + +Mathematical formulation +------------------------ + +In the unpenalized case, the assumptions are the folowing: + + * The target values :math:`y_i` are realizations of random variables + :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` + with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter + :math:`\phi` and sample weights :math:`s_i`. + * The aim is to predict the expectation :math:`\mu_i` with + :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + +Note that the first assumption implies +:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance +function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the +same as specifying a unit variance function (they are one-to-one). + +Including penalties helps to avoid overfitting or, in case of L1 penalty, to +obtain sparse solutions. But there are also other motivations to include them, +e.g. accounting fo dependence structure of :math:`y`. + +The objective function, which is independent of :math:`\phi`, is minimized with +respect to the coefficients :math:`w`. + +The deviance is defined by + +.. math:: D(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\frac{\phi}{s}) + - loglike(y,y,\frac{\phi}{s})\right) + +===================================== ================================= +Distribution Variance Function :math:`v(\mu)` +===================================== ================================= +Normal ("normal") :math:`1` +Poisson ("poisson") :math:`\mu` +Gamma ("gamma") :math:`\mu^2` +Inverse Gaussian ("inverse.gaussian") :math:`\mu^3` +===================================== ================================= + +Two remarks: + +* The deviances for at least Normal, Poisson and Gamma distributions are + strictly consistent scoring functions for the mean :math:`\mu`, see Eq. + (19)-(20) in [3]_. + +* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) + you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. + + +.. topic:: References: + + .. [1] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. [2] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + See also `Exponential dispersion model. `_ + + .. [3] Gneiting, T. (2010). 
`Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b80842f817f4d..2db3c56d5e1c1 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,15 +6,15 @@ # License: BSD 3 clause # TODO: Write more tests -# TODO: Which name/symbol for coefficients and weights in docu? -# sklearn.linear_models uses w for coefficients. -# So far, coefficients=beta and weights=w (as standard literature) -# TODO: Add l2-penalty (maybe more general w.P.w with P penalty matrix) # TODO: Add l1-penalty (elastic net) +# TODO: deal with option self.copy_X +# TODO: Should the option `normalize` be included (like other linear models)? +# So far, it is not included. User must pass a normalized X. # TODO: Add cross validation -# TODO: Write docu and examples +# TODO: Write examples and more docu # TODO: Make it as much consistent to other estimators in linear_model as # possible +# TODO: options P1 and P2 in fit() or in __init__()??? # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -22,10 +22,21 @@ # Linear Model does both depending on the chosen distribution, e.g. Normal => # regressor, Bernoulli/Binomial => classifier. # Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w a componentwise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. # Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. + from __future__ import division from abc import ABCMeta, abstractmethod, abstractproperty @@ -34,6 +45,8 @@ from scipy import linalg, optimize, sparse import warnings from .base import LinearRegression +from .coordinate_descent import ElasticNet +from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six @@ -164,7 +177,8 @@ class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): References ---------- - See https://en.wikipedia.org/wiki/Exponential_dispersion_model. + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. """ @abstractproperty @@ -192,7 +206,7 @@ def include_upper_bound(self): raise NotImplementedError() def in_y_range(self, x): - """Returns true if x is in the valid range of Y~EDM. + """Returns true if `x` is in the valid range of Y~EDM. """ if self.include_lower_bound: if self.include_upper_bound: @@ -211,33 +225,36 @@ def in_y_range(self, x): @abstractmethod def unit_variance(self, mu): - r"""The unit variance :math:`v(mu)` determines the variance as - a function of the mean mu by - :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. + r"""The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. 
It can also be derived from the unit deviance :math:`d(y,\mu)` as .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. """ raise NotImplementedError() @abstractmethod def unit_variance_derivative(self, mu): - r"""The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. """ raise NotImplementedError() def variance(self, mu, phi=1, weights=1): - r"""The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is - :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, - with unit variance v(mu). + r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. """ return phi/weights * self.unit_variance(mu) def variance_derivative(self, mu, phi=1, weights=1): - r"""The derivative of the variance w.r.t. mu, + r"""The derivative of the variance w.r.t. `mu`, :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/w_i*v'(\mu_i)`, with unit variance v(mu). + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. """ return phi/weights * self.unit_variance_derivative(mu) @@ -251,8 +268,8 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - r"""The derivative w.r.t. mu of the unit_deviance - :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + r"""The derivative w.r.t. `mu` of the unit deviance + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. Returns @@ -262,39 +279,39 @@ def unit_deviance_derivative(self, y, mu): return -2*(y-mu)/self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) - with weights :math:`w_i` and unit_deviance :math:`d(y,mu)`. + r"""The deviance is given by :math:`D = \sum_i s_i \cdot d(y, \mu) + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. In terms of the likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{w}) - - loglike(y,y,\frac{phi}{w})\right).` + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. """ return np.sum(weights*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weights, link): - """The deviance as a function of the coefficients ``coef`` - (:math:`beta`). + """The deviance as a function of the coefficients `coef` + (:math:`w`). """ lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) return self.deviance(y, mu, weights) def deviance_derivative(self, y, mu, weights=1): - """The derivative w.r.t. mu of the deviance.` + """The derivative w.r.t. `mu` of the deviance. """ return weights*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weights, link): - r"""The score function :math:`s` is the derivative of the - log-likelihood w.r.t. the ``coef`` (:math:`\beta`). + r"""The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). It is given by .. math: - \mathbf{s}(\boldsymbol{\beta}) = \mathbf{X}^T \mathbf{D} + \mathbf{score}(\boldsymbol{w}) = \mathbf{X}^T \mathbf{D} \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and - :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}(y_1),\ldots)`. + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. 
""" n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -303,23 +320,27 @@ def _score(self, coef, phi, X, y, weights, link): d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) - score = safe_sparse_dot(X.T, temp, dense_output=False) + temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=True) + score = safe_sparse_dot(X.T, temp, dense_output=True) return score def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""The Fisher information matrix, also known as expected - information matrix. It is given by + r"""The Fisher information matrix. + The Fisher information matrix, also known as expected information + matrix is given by .. math: - \mathbf{F}(\boldsymbol{\beta}) = \mathrm{E}\left[ - -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} - \partial\boldsymbol{\beta}^T}\right] + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] = \mathbf{X}^T W \mathbf{X} \,, with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see score function. + see func:`score_function`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -333,14 +354,15 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): - r"""The observed information matrix, also known as the negative of + r"""The observed information matrix. + The observed information matrix, also known as the negative of the Hessian matrix of the log-likelihood. It is given by .. math: - \mathbf{H}(\boldsymbol{\beta}) = - -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} - \partial\boldsymbol{\beta}^T} + \mathbf{H}(\boldsymbol{w}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T} = \mathbf{X}^T \legt[ - \mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R} @@ -351,7 +373,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ v(\mu_i)} \right)`, - see score function and Fisher matrix. + see :func:`score_` function and :func:`_fisher_matrix`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -368,18 +390,18 @@ def _observed_information(self, coef, phi, X, y, weights, link): return observed_information def _deviance_derivative(self, coef, X, y, weights, link): - r"""The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a - function of the coefficients ``coef``. + r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a + function of the coefficients `coef`. This is equivalent to :math:`-2\phi` times the score function - :math:`s` (derivative of the log-likelihood). + :func:`score_function` (derivative of the log-likelihood). """ score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) return -2*score def _deviance_hessian(self, coef, X, y, weights, link): - r"""The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance - as a function of the coefficients ``coef``. + r"""The hessian matrix w.r.t. `coef` (:math:`w`) of the deviance + as a function of the coefficients `coef`. This is equivalent to :math:`+2\phi` times the observed information matrix. 
""" @@ -388,20 +410,21 @@ def _deviance_hessian(self, coef, X, y, weights, link): return 2*info_matrix def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in IRLS.""" - return ((weights*y+np.mean(weights*y)) - / (2.*np.sum(np.ones_like(y)*weights))) + """Starting values for the mean mu_i in (unpenalized) IRLS.""" + return ((weights*y+np.mean(weights*y)) / + (2.*np.sum(np.ones_like(y)*weights))) class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - They have mu=E[X] and Var[X] \propto mu**power. + They have :math:`\mu=\mathrm{E}[Y]` and + :math:`\mathrm{Var}[Y] \propto \mu^power. Attributes ---------- power : float The variance power of the unit_variance - :math:`v(mu) = mu^{power}`. + :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): self.power = power @@ -497,7 +520,7 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, beta, phi, weights=1): + def likelihood(self, y, X, w, phi, weights=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -563,51 +586,135 @@ def unit_deviance(self, y, mu): np.log((1+mu**2)/(1+y**2))) +def _irls_step(X, W, P2, z): + """One step in iteratively reweighted least squares + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. + + Parameters + ---------- + X : numpy array or sparse matrix of shape (n_samples, n_features) + Training data (with intercept included if present) + + W : numpy array of shape (n_samples, ) + + P2 : numpy array or sparse matrix of shape (n_features, n_features) + The l2-penalty matrix or vector (=diagonal matrix) + + z : numpy array of shape (n_samples, ) + Working observations + + Returns + ------- + coef: array, shape = (X.shape[1]) + """ + # TODO: scipy.linalg.solve if faster, but ordinary least squares uses + # scipy.linalg.lstsq. What is more appropriate? + n_samples, n_features = X.shape + if sparse.issparse(X): + W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() + if P2.ndim == 1: + L2 = (sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) + ).tocsr() + else: + L2 = sparse.csr_matrix(P2) + XtW = X.transpose() * W + A = XtW * X + L2 + b = XtW * z + coef = sparse.linalg.spsolve(A, b) + else: + XtW = (X.T * W) + A = XtW.dot(X) + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + else: + A += P2 + b = XtW.dot(z) + coef = linalg.solve(A, b) + return coef + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - r""" - Class to fit a Generalized Linear Model (GLM) based on reproductive - Exponential Dispersion Models (EDM). + """Regression via a Generalized Linear Model (GLM) based on reproductive + Exponential Dispersion Models (EDM) with combined L1 and L2 priors as + regularizer. + + Minimizes the objective function:: + + 1/(2s) * deviance(y, h(X*w)) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function `h` and s=sum of `sample_weight` (which equals + n_samples for `sample_weight=None`). 
+ For `P1`=`P2`=identity, the penalty is the elastic net:: - #TODO: This belongs to User Guide - Assumptions: + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - - The target values y_i are realizations of random variables - :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion - parameter :math:`\phi` and weights :math:`w_i`. - - The expectation of :math:`Y_i` is :math:`\mu_i=\mathrm{E}[Y]=h(\eta_i)` - whith the linear predictor :math:`\eta=X*\beta`, inverse link function - :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` - to be estimated. + If you are interested in controlling the L1 and L2 penalty + separately, keep in mind that this is equivalent to:: - Note that the first assumption implies - :math:`\mathrm{Var}[Y_i]=\frac{\phi}{w_i} v(\mu_i)` with uni variance - function :math:`v(\mu)`. + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter `l1_ratio` corresponds to alpha in the glmnet R package while + alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio + = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\mu_i=h(\eta_i)` and - :math:`Var[Y_i]=\frac{\phi}{w_i} v(\mu_i)` + the first two moments :math:`E[Y_i]=\\mu_i=h(\\eta_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{w_i} v(\\mu_i)`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + (penalized) maximum likelihood which is equivalent to minimizing the + deviance. - The parameters :math:`\beta` are estimated by maximum likelihood which is - equivalent to minimizing the deviance. + TODO: For `alpha` > 0, the feature matrix `X` is assumed to be + standardized. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. TODO: Estimation of the dispersion parameter phi. - TODO: Notes on weights and 'scaled' Poisson, e.g. fit y = x/w with - with x=counts and w=exposure (time, money, persons, ...) => y is a - ratio with weights w. + TODO: Notes on weights and 'scaled' distributions. For Poisson, this means + to fit y = z/w with z=counts and w=exposure (time, money, persons, ...) + => y is a ratio with weights w. Same for other distributions. Parameters ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms und thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (defaul=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance - of a subclass of ExponentialDispersionModel, optional - (default='normal') + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ + of class ExponentialDispersionModel, optional(default='normal') the distributional assumption of the GLM. 
- link : {'identity', 'log'} or an instance of a subclass of Link, + link : {'identity', 'log'} or an instance of class Link, optional (default='identity') the link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). @@ -634,28 +741,41 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): where ``g_i`` is the i-th component of the gradient (derivative of the deviance). - start_params : {array shape (n_features, ), 'ols'}, optional (default=None) - sets the start values for coef_ in the fit. - If None, default values are taken. - If 'ols' the result of an ordinary least squares in the link space - (linear predictor) is taken. - If an array is given, these values are taken as coef_ to start with. - If fit_intercept is true, the first value is assumed to be the start - value for the intercept_. + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to fit as + initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to fit), option ``start_params`` sets the + starting values for ``coef_`` and ``intercept_``. + + start_params : None or array of shape (n_features, ) or 'least_squares'}, \ + optional (default=None) + If an array of size n_features is supplied, use these as start values + for ``coef_`` in the fit. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + If 'least_squares' is set, the result of a least squares fit in the + link space (linear predictor) is taken. If ``None``, the start values + are calculated by setting mu to family.starting_mu(..) and one step of + irls. + This option only applies if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not exist). + + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. verbose : int, optional (default=0) For the lbfgs solver set verbose to any positive number for verbosity. Attributes ---------- - coef_ : array, shape (1, n_features) + coef_ : array, shape (n_features, ) Estimated coefficients for the linear predictor (X*coef_) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. dispersion_ : float - The dispersion parameter :math:`\phi` if fit_dispersion is set. + The dispersion parameter :math:`\\phi` if fit_dispersion is set. n_iter_ : int Actual number of iterations of the solver. 
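For orientation, a minimal usage sketch consistent with the parameters and attributes documented above (toy data only; the exact fitted numbers are not claimed)::

    import numpy as np
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    X = np.array([[1.0, 0.0], [1.0, 1.0], [2.0, 2.0], [3.0, 1.0]])
    y = np.array([1.0, 2.0, 6.0, 4.0])   # strictly positive, valid for family='poisson'

    glm = GeneralizedLinearRegressor(alpha=0, family='poisson', link='log',
                                     fit_intercept=True)
    glm.fit(X, y)
    print(glm.coef_)        # array of shape (n_features, )
    print(glm.intercept_)   # scalar
    print(glm.n_iter_)      # iterations used by the solver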
@@ -667,10 +787,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- TODO """ - - def __init__(self, fit_intercept=True, family='normal', - link='identity', fit_dispersion='chisqr', solver='irls', - max_iter=100, tol=1e-4, start_params=None, verbose=0): + def __init__(self, alpha=1.0, l1_ratio=0, + fit_intercept=True, family='normal', link='identity', + fit_dispersion='chisqr', solver='irls', max_iter=100, + tol=1e-4, warm_start=False, start_params=None, copy_X=True, + verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio self.fit_intercept = fit_intercept self.family = family self.link = link @@ -678,31 +801,86 @@ def __init__(self, fit_intercept=True, family='normal', self.solver = solver self.max_iter = max_iter self.tol = tol + self.warm_start = warm_start self.start_params = start_params + self.copy_X = copy_X self.verbose = verbose - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, P1=None, P2=None, + check_input=True): """Fit a generalized linear model. Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix of shape (n_samples, n_features) Training data - y : numpy array of shape [n_samples] + y : numpy array of shape (n_samples, ) Target values - sample_weight : numpy array of shape [n_samples] + sample_weight : array of shape (n_samples, ) or None,\ + optinal (default=None) Individual weights for each sample. Var[Y_i]=phi/weight_i * v(mu) If Y_i ~ EDM(mu, phi/w_i) then sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. + P1 : None or array of shape (n_features*, ), optional\ + (default=None) + With this array, you can exclude coefficients from ths L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``None`` is the same as an array of ones. + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + P2 : None or array of shape (n_features*, n_features*) + With this square matrix the L2 penalty is calculated as `w P2 w`. + This gives a fine control over this penalty (Tikhonov + regularization). + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + check_input : boolean, optional (default=True) + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + Returns ------- self : returns an instance of self. """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) + y = y.astype(np.float64) + + if sample_weight is None: + weights = np.ones_like(y) + elif np.isscalar(sample_weight): + weights = sample_weight*np.ones_like(y) + else: + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != y.shape[0]: + raise ValueError("Sample weights must have the same length as" + " y") + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights = weights/np.sum(weights) + + if not isinstance(check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(check_input)) + + # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): @@ -720,7 +898,8 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") + " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" + " got (family={0})".format(self.family)) # Garantee that self._link_instance is set to an instance of class Link if isinstance(self.link, Link): @@ -733,132 +912,232 @@ def fit(self, X, y, sample_weight=None): else: raise ValueError( "The link must be an instance of class Link or" - " an element of ['identity', 'log'].") - + " an element of ['identity', 'log']; got (link={0})" + .format(self.link)) + + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be non-negative;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be in interval [0, 1]; got" + " (l1_ratio={0]})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): - raise ValueError("The argument fit_intercept must be bool," + raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) if self.solver not in ['irls', 'lbfgs', 'newton-cg']: raise ValueError("GLM Regression supports only irls, lbfgs and" "newton-cg solvers, got {0}".format(self.solver)) + if self.alpha > 0: + if (self.l1_ratio > 0 and + self.solver not in []): + # TODO: Add solver for L1 + # raise ValueError("The solver option (solver={0}) is not " + # "appropriate for the chosen penalty which" + # " includes L1 (alpha={1})." 
+ # .format(self.solver, self.alpha)) + raise NotImplementedError("Currently, no solver is implemented" + " that can deal with L1 penalties.") if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iteration must be positive;" " got (max_iter={0!r})".format(self.max_iter)) if not isinstance(self.tol, numbers.Number) or self.tol < 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params is not 'ols': + if start_params is not None and start_params is not 'least_squares': start_params = np.atleast_1d(start_params) - if start_params.shape[0] != X.shape[1] + self.fit_intercept: + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" - "right length; required length {0}, got {1}" + "right length and dimension; required (length" + "={0}, ndim=1), got (length={1}, ndim={2})." .format(X.shape[1] + self.fit_intercept, - start_params.shape[0])) - - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=False) - y = y.astype(np.float64) + start_params.shape[0], + start_params.ndim)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + + if P1 is None: + P1 = np.ones(X.shape[1]) + else: + P1 = np.atleast_1d(P1) + if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): + raise ValueError("P1 must be either None or an 1D array with " + "the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." 
+ .format(P1.shape[0], X.shape[1])) + if P2 is None: + P2 = np.ones(X.shape[1]) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + else: + P2 = check_array(P2, accept_sparse=['csr', 'csc', 'coo'], + dtype="numeric", ensure_2d=True) + if ((P2.shape[0] != P2.shape[1]) or + (P2.shape[0] != X.shape[1]) or + (P2.ndim != 2)): + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},{1})), needed ({3},{3})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) family = self._family_instance link = self._link_instance - if not np.all(family.in_y_range(y)): - raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}" - .format(family.__class__.__name__)) - - if sample_weight is None: - weights = np.ones_like(y) - elif np.isscalar(sample_weight): - weights = sample_weight*np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != y.shape[0]: - raise ValueError("Sample weights must have the same length as" - " y") - if self.fit_intercept: # intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) + P1 = np.concatenate((np.array([0]), P1)) + if P2.ndim == 1: + P2 = np.concatenate((np.array([0]), P2)) + elif sparse.issparse(P2): + P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), + dtype=P2.dtype).tocsr() + else: + P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], + [np.zeros((X.shape[1], 1)), P2]]) else: Xnew = X n_samples, n_features = Xnew.shape - - # Note: Since dispersion_ alias phi does not enter the estimation - # of mu_i=E[y_i] set it to 1 where convenient. + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1-self.l1_ratio) + P1 *= l1 + P2 *= l2 + + # 1.3 additional validations ########################################## + if check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # TODO: if alpha=0 check that Xnew is not rank deficient + # TODO: what else to check? + + ####################################################################### + # 2. initialization of coef = (intercept_, coef_) # + ####################################################################### + # Note: Since phi=self.dispersion_ does not enter the estimation + # of mu_i=E[y_i], set it to 1. 
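For illustration, a standalone sketch of the intercept handling above: both penalty arrays receive a leading zero so that the intercept is never penalized (shapes assumed; this mirrors the dense branch of the code rather than adding new behaviour)::

    import numpy as np

    n_features = 3
    P1 = np.ones(n_features)          # per-feature L1 weights
    P2 = np.eye(n_features)           # dense Tikhonov matrix for the L2 term

    # fit_intercept=True: a first column of ones is prepended to X, and both
    # penalties get a leading zero so the intercept stays unpenalized
    P1 = np.concatenate((np.array([0]), P1))
    P2 = np.block([[np.zeros((1, 1)), np.zeros((1, n_features))],
                   [np.zeros((n_features, 1)), P2]])
    # P1 -> [0. 1. 1. 1.];  P2 -> 4x4 with zero first row and first column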
# set start values for coef coef = None - if start_params is None: - # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weights) - # linear predictor - eta = link.link(mu) - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights w, in principle a diagonal matrix - # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weights=weights)) - wroot = np.sqrt(w) - # working observations - yw = eta + (y-mu)/hp - # least squares rescaled with wroot - wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) - X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) - yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) - coef = linalg.lstsq(X_rescale, yw_rescale)[0] - elif start_params is 'ols': - reg = LinearRegression(copy_X=False, fit_intercept=False) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ + if self.warm_start and hasattr(self, "coef_"): + if self.fit_intercept: + coef = np.concatenate((self.intercept_, self.coef_)) + else: + coef = self.coef_ + elif self.start_params is None: + if self.l1_ratio == 0: + # See 3.1 IRLS + # Use mu_start and apply one irls step to calculate coef + mu = family.starting_mu(y, weights) + # linear predictor + eta = link.link(mu) + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(Xnew, W, P2, z) + else: + # with L1 penalty, start with coef = 0 + coef = np.zeros(n_features) + elif self.start_params is 'least_squares': + if self.alpha == 0: + reg = LinearRegression(copy_X=True, fit_intercept=False) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + elif self.l1_ratio <= 0.01: + # ElasticNet says l1_ratio <= 0.01 is not reliable, use Ridge + reg = Ridge(copy_X=True, fit_intercept=False, + alpha=self.alpha) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + else: + # TODO: Does this make sense? + reg = ElasticNet(copy_X=True, fit_intercept=False, + alpha=self.alpha, l1_ratio=self.l1_ratio) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ else: coef = start_params + ####################################################################### + # 3. fit # + ####################################################################### # algorithms for optimiation # TODO: Parallelize it self.n_iter_ = 0 converged = False + # 3.1 IRLS ############################################################ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. 
E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) + # Note: P2 = l2*P2, see above if self.solver == 'irls': - # linear predictor + # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 # coef_old not used so far. # coef_old = coef - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights w, in principle a diagonal matrix + # working weights W, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weights=weights)) - wroot = np.sqrt(w) + W = (hp**2 / V) # working observations - yw = eta + (y-mu)/hp - # least squares rescaled with wroot - wroot = sparse.dia_matrix((wroot, 0), - shape=(n_samples, n_samples)) - X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) - yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) - coef, residues, rank, singular_ = ( - linalg.lstsq(X_rescale, yw_rescale)) + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(Xnew, W, P2, z) # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) # which tolerace? |coef - coef_old| or gradient? # use gradient for compliance with newton-cg and lbfgs - # TODO: faster computation of gradient, use mu and eta directly - gradient = family._deviance_derivative( - coef=coef, X=Xnew, y=y, weights=weights, link=link) + # gradient = family._deviance_derivative( + # coef=coef, X=Xnew, y=y, weights=weights, link=link) + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + gradient = -safe_sparse_dot(Xnew.T, hp*(y-mu)/V) + if P2.ndim == 1: + gradient += P2*coef + else: + gradient += safe_sparse_dot(P2, coef) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -868,50 +1147,73 @@ def fit(self, X, y, sample_weight=None): "of iterations (currently {0})" .format(self.max_iter), ConvergenceWarning) + # 3.2 L-BFGS and Newton-CG ############################################ # TODO: performance: make one function return both deviance and # gradient of deviance - elif self.solver == 'lbfgs': - func = family._deviance - fprime = family._deviance_derivative - args = (Xnew, y, weights, link) - coef, loss, info = optimize.fmin_l_bfgs_b( - func, coef, fprime=fprime, - args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." 
- " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}".format( - info["task"])) - self.n_iter_ = info['nit'] - elif self.solver == 'newton-cg': - func = family._deviance - grad = family._deviance_derivative + elif self.solver in ['lbfgs', 'newton-cg']: + def func(coef, *args): + if P2.ndim == 1: + L2 = safe_sparse_dot(coef.T, P2*coef) + else: + L2 = safe_sparse_dot(coef.T, safe_sparse_dot(P2, coef)) + # A[np.diag_indices_from(A)] += P2 + return 0.5*family._deviance(coef, *args) + 0.5*L2 + + def fprime(coef, *args): + if P2.ndim == 1: + L2 = P2*coef + else: + L2 = safe_sparse_dot(P2, coef) + return 0.5*family._deviance_derivative(coef, *args) + L2 def grad_hess(coef, X, y, weights, link): - grad = (family._deviance_derivative( - coef, X, y, weights, link)) - hessian = (family._deviance_hessian( - coef, X, y, weights, link)) + if P2.ndim == 1: + L2 = P2*coef + else: + L2 = safe_sparse_dot(P2, coef) + grad = 0.5*family._deviance_derivative( + coef, X, y, weights, link) + L2 + hessian = 0.5*family._deviance_hessian( + coef, X, y, weights, link) + if P2.ndim == 1: + hessian[np.diag_indices_from(hessian)] += P2 + else: + hessian += P2 def Hs(s): - ret = np.dot(hessian, s) + ret = safe_sparse_dot(hessian, s) return ret return grad, Hs - hess = grad_hess + args = (Xnew, y, weights, link) - coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, - maxiter=self.max_iter, tol=self.tol) - self.coef_ = coef + if self.solver == 'lbfgs': + coef, loss, info = optimize.fmin_l_bfgs_b( + func, coef, fprime=fprime, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + elif self.solver == 'newton-cg': + coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + ####################################################################### + # 4. postprocessing # + ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] else: + # set intercept to zero as the other linear models do self.intercept_ = 0. self.coef_ = coef @@ -988,8 +1290,8 @@ def score(self, X, y, sample_weight=None): :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is average by sample_weight. In the case of a Normal - distribution, this D^2 equals R^2. + :math:`\bar{y}` is averaged by sample_weight. In the case of a Normal + distribution, D^2 equals R^2. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
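For concreteness, a standalone sketch of this D² using the distribution classes of this module (the estimator's own `score` additionally handles sample weights and the intercept-only null fit internally)::

    import numpy as np
    from sklearn.linear_model.glm import PoissonDistribution

    family = PoissonDistribution()
    y_true = np.array([1.0, 2.0, 3.0, 4.0])
    y_pred = np.array([1.5, 1.5, 3.5, 3.5])
    weights = np.ones_like(y_true)

    dev = family.deviance(y_true, y_pred, weights)
    y_mean = np.average(y_true, weights=weights)      # null model: weighted mean of y
    dev_null = family.deviance(y_true, y_mean * np.ones_like(y_true), weights)
    d2 = 1 - dev / dev_null     # 1.0 for a perfect fit, can be negative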
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a4d4ea8650860..df0413b4d7836 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,19 +1,34 @@ import numpy as np from sklearn.linear_model.glm import ( - # Link, IdentityLink, + Link, + IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - # GeneralizedHyperbolicSecand, + GeneralizedHyperbolicSecand, GeneralizedLinearRegressor) +from sklearn.linear_model.ridge import Ridge from sklearn.utils.testing import ( - # assert_equal, + assert_equal, assert_almost_equal, assert_array_equal, assert_array_almost_equal) +def test_link_properties(): + """Test link inverse and derivative + """ + rng = np.random.RandomState(0) + x = rng.rand(100)*100 + from sklearn.linear_model.glm import Link + for link in vars()['Link'].__subclasses__(): + link = link() + assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) + assert_almost_equal(link.inverse_derivative(link.link(x)), + 1/link.derivative(x), decimal=10) + + def test_family_bounds(): """Test the valid range of distributions """ @@ -42,8 +57,23 @@ def test_family_bounds(): assert_array_equal(result, [False, False, True]) +def test_deviance_zero(): + """Test deviance(y,y) = 0 for different families + """ + for family in [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=4), + GeneralizedHyperbolicSecand()]: + assert_almost_equal(family.deviance(0.1, 0.1), 0, decimal=10) + assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) + + def test_glm_identiy_regression(): - """Test linear regression on a simple dataset + """Test GLM regression with identity link on a simple dataset """ coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -55,13 +85,13 @@ def test_glm_identiy_regression(): for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( - family=family, fit_intercept=False, solver=solver) + alpha=0, family=family, fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) def test_glm_log_regression(): - """Test linear regression on a simple dataset + """Test GLM regression with log link on a simple dataset """ coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -73,10 +103,79 @@ def test_glm_log_regression(): for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( - family=family, link=LogLink(), fit_intercept=False, - solver=solver, start_params='ols') + alpha=0, family=family, link=LogLink(), fit_intercept=False, + solver=solver, start_params='least_squares') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) +def test_normal_ridge(): + """Test ridge regression for Normal distributions + + Compare to test_ridge in test_ridge.py. 
+ """ + rng = np.random.RandomState(0) + alpha = 1.0 + + # With more samples than features + n_samples, n_features, n_predict = 6, 5, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + T = rng.randn(n_predict, n_features) + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge.fit(X, y) + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=True, solver=solver) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=False, solver='irls') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + # With more features than samples + n_samples, n_features, n_predict = 5, 10, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + T = rng.randn(n_predict, n_features) + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge.fit(X, y) + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=True, solver=solver) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=False, solver='irls') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + # TODO: Test compatibility with R's glm, glmnet From 5b46c23977a8e386987a2767b2c12d4296d332af Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 19 Sep 2017 00:40:34 +0200 Subject: [PATCH 07/69] [WIP] Add Generalized Linear Models (#9405) * fix some bugs in user guide linear_model.rst * fix some pep8 issues in test_glm.py --- doc/modules/linear_model.rst | 28 +++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 7 ++++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 51b3821fa6207..98736facd9b76 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -880,14 +880,14 @@ Generalized linear regression ============================= :class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [1]_. First, the predicted values :math:`\hat{y}` are linked to a linear +ways [8]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. 
math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [2]_. The objective function beeing minimized +exponential dispersion model (EDM) [9]_. The objective function beeing minimized becomes .. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 @@ -914,16 +914,20 @@ it is convenient to apply a link function different from the identity link :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This -ensures that the penalty treats features equally. +ensures that the penalty treats features equally. The estimator can be used as +follows:: - >>> from sklearn import linear_model - >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0) - >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> from sklearn.linear_model import GeneralizedLinearRegressor + >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + GeneralizedLinearRegressor(alpha=0.5, copy_X=True, family='poisson', + fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, + link='log', max_iter=100, solver='irls', start_params=None, + tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) - >>> reg.intercept_ - -0.76383575123143277 + >>> reg.intercept_ #doctest: +ELLIPSIS + -0.76383575... Mathematical formulation ------------------------ @@ -969,7 +973,7 @@ Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [3]_. + (19)-(20) in [10]_. * If you want to model a frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing @@ -979,12 +983,12 @@ Two remarks: .. topic:: References: - .. [1] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [8] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [2] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + .. [9] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. See also `Exponential dispersion model. `_ - .. [3] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [10] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. 
`_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index df0413b4d7836..b62b51b5bcb9e 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from sklearn.linear_model.glm import ( Link, - IdentityLink, + # IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, @@ -21,8 +21,9 @@ def test_link_properties(): """ rng = np.random.RandomState(0) x = rng.rand(100)*100 - from sklearn.linear_model.glm import Link - for link in vars()['Link'].__subclasses__(): + # from sklearn.linear_model.glm import Link + # for link in vars()['Link'].__subclasses__(): + for link in Link.__subclasses__(): link = link() assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) assert_almost_equal(link.inverse_derivative(link.link(x)), From 10dd14603a5fc04f53ca4920621434aaff662064 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 3 Dec 2017 19:54:57 +0100 Subject: [PATCH 08/69] [WIP] Add Generalized Linear Models (#9405) * added test: ridge poisson with log-link compared to glmnet * fix ValueError message for l1_ratio * fix ValueError message for P2 * string comparison: use '==' and '!=' instead of 'is' and 'is not' * fix RuntimeWarnings in unit_deviance of poisson: x*log(x) as xlogy * added test for fisher matrix * added test for family argument --- sklearn/linear_model/glm.py | 29 ++++++----- sklearn/linear_model/tests/test_glm.py | 72 ++++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 2db3c56d5e1c1..93ce358a8a874 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -15,6 +15,8 @@ # TODO: Make it as much consistent to other estimators in linear_model as # possible # TODO: options P1 and P2 in fit() or in __init__()??? +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -42,7 +44,7 @@ from abc import ABCMeta, abstractmethod, abstractproperty import numbers import numpy as np -from scipy import linalg, optimize, sparse +from scipy import linalg, optimize, sparse, special import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -340,7 +342,7 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): = \mathbf{X}^T W \mathbf{X} \,, with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see func:`score_function`. + see func:`_score`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -363,7 +365,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): \mathbf{H}(\boldsymbol{w}) = -\frac{\partial^2 loglike}{\partial\boldsymbol{w} \partial\boldsymbol{w}^T} - = \mathbf{X}^T \legt[ + = \mathbf{X}^T \left[ - \mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R} + \mathbf{D}^2 @@ -393,7 +395,7 @@ def _deviance_derivative(self, coef, X, y, weights, link): r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a function of the coefficients `coef`. This is equivalent to :math:`-2\phi` times the score function - :func:`score_function` (derivative of the log-likelihood). + :func:`_score` (derivative of the log-likelihood). 
""" score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) @@ -510,7 +512,8 @@ def unit_deviance(self, y, mu): return (y-mu)**2 if p == 1: # PoissonDistribution - return 2 * (np.where(y == 0, 0, y*np.log(y/mu))-y+mu) + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -921,7 +924,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): raise ValueError("l1_ratio must be in interval [0, 1]; got" - " (l1_ratio={0]})".format(self.l1_ratio)) + " (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) @@ -948,7 +951,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params is not 'least_squares': + if start_params is not None and start_params != 'least_squares': start_params = np.atleast_1d(start_params) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): @@ -986,7 +989,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " - "got (P2.shape=({0},{1})), needed ({3},{3})" + "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) family = self._family_instance @@ -1058,7 +1061,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif self.start_params is 'least_squares': + elif self.start_params == 'least_squares': if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1277,11 +1280,9 @@ def estimate_phi(self, y, X, sample_weight): dev = self._family_instance.deviance(y, mu, sample_weight) return dev/(n_samples - n_features) -# TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" -# in check_estimator for score -# from sklearn.utils.estimator_checks import check_estimator -# from sklearn.linear_model import GeneralizedLinearRegressor -# check_estimator(GeneralizedLinearRegressor) + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): r"""Returns D^2, a generalization of the coefficient of determination R^2, which uses deviance instead of squared error. diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index b62b51b5bcb9e..de7de90db967b 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,4 +1,6 @@ import numpy as np +from numpy.testing import assert_allclose +import scipy as sp from sklearn.linear_model.glm import ( Link, @@ -73,6 +75,46 @@ def test_deviance_zero(): assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) +def test_fisher_matrix(): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + for family in [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution()]: + link = LogLink() + rng = np.random.RandomState(0) + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)] + ) + assert_allclose(fisher, approx, rtol=1e-3) + + +def test_glm_family_argument(): + """Test GLM family argument set as string + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for (f, fam) in [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution())]: + glm = GeneralizedLinearRegressor(family=f, fit_intercept=False, + alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) + + def test_glm_identiy_regression(): """Test GLM regression with identity link on a simple dataset """ @@ -82,7 +124,8 @@ def test_glm_identiy_regression(): families = ( NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecand()) for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( @@ -100,7 +143,8 @@ def test_glm_log_regression(): families = ( NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecand()) for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( @@ -179,4 +223,26 @@ def test_normal_ridge(): assert_array_almost_equal(glm.predict(T), ridge.predict(T)) -# TODO: Test compatibility with R's glm, glmnet +def test_poisson_ridge(): + """Test ridge regression with poisson family and LogLink + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, family='poisson', + link='log', tol=1e-10) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, decimal=7) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=7) From 72485b63e89879e65381bca12152b54600fd3970 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 8 Jan 2018 22:13:45 +0100 Subject: [PATCH 09/69] [WIP] Add Generalized Linear Models (#9405) * put arguments P1, P2 and check_input from fit to __init__ * added check_input test: is P2 positive definite? 
* added solver option: 'auto' --- sklearn/linear_model/glm.py | 181 +++++++++++++++++++++++------------- 1 file changed, 117 insertions(+), 64 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 93ce358a8a874..3f6b91026ef9b 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -14,7 +14,7 @@ # TODO: Write examples and more docu # TODO: Make it as much consistent to other estimators in linear_model as # possible -# TODO: options P1 and P2 in fit() or in __init__()??? +# TODO: which dtype to force for y and X? Which for P1, P2? # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? @@ -709,6 +709,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. + P1 : None or array of shape (n_features*, ), optional\ + (default=None) + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``None`` is the same as an array of ones. + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + P2 : None or array of shape (n_features*, n_features*) + With this square matrix the L2 penalty is calculated as `w P2 w`. + This gives a fine control over this penalty (Tikhonov + regularization). + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). P2 must be positive semi-definite. + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -727,13 +742,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'newton-cg', 'lbfgs'}, optional (defaul='irls') + solver : {'auto', 'irls', 'newton-cg', 'lbfgs'}, optional (defaul='auto') Algorithm to use in the optimization problem. - - 'irls' is iterated reweighted least squares. It is the standard - algorithm for GLMs. + - 'irls' is iterated reweighted least squares (Fisher scoring). + It is the standard algorithm for GLMs. Cannot deal with + L1 penalties. + + - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'newton-cg', 'lbfgs' + - 'auto' sets 'irls'. max_iter : int, optional (default=100) TODO @@ -766,6 +784,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. + check_input : boolean, optional (default=True) + Allow to bypass several checks on input: y values in range of family, + sample_weights non-negative, P2 positive semi-definite. + Don't use this parameter unless you know what you do. + verbose : int, optional (default=0) For the lbfgs solver set verbose to any positive number for verbosity. 
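A hedged construction sketch for the new `P1`/`P2` init parameters (illustrative values only; `l1_ratio=0` because, as noted above, no L1-capable solver is implemented yet)::

    import numpy as np
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    n_features = 3
    P1 = np.array([1.0, 0.0, 1.0])      # exclude the 2nd coefficient from the L1 term
    P2 = np.diag([1.0, 0.1, 1.0])       # positive semi-definite Tikhonov matrix

    glm = GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0, P1=P1, P2=P2,
                                     family='poisson', link='log', solver='auto')
    # glm.fit(X, y) would then apply 0.5 * 1/2 * w' P2 w as the L2 penalty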
@@ -790,13 +813,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- TODO """ - def __init__(self, alpha=1.0, l1_ratio=0, + def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', - fit_dispersion='chisqr', solver='irls', max_iter=100, + fit_dispersion='chisqr', solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params=None, copy_X=True, - verbose=0): + check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 self.fit_intercept = fit_intercept self.family = family self.link = link @@ -807,10 +832,10 @@ def __init__(self, alpha=1.0, l1_ratio=0, self.warm_start = warm_start self.start_params = start_params self.copy_X = copy_X + self.check_input = check_input self.verbose = verbose - def fit(self, X, y, sample_weight=None, P1=None, P2=None, - check_input=True): + def fit(self, X, y, sample_weight=None): """Fit a generalized linear model. Parameters @@ -823,31 +848,13 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, sample_weight : array of shape (n_samples, ) or None,\ optinal (default=None) - Individual weights for each sample. - Var[Y_i]=phi/weight_i * v(mu) - If Y_i ~ EDM(mu, phi/w_i) then + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. - P1 : None or array of shape (n_features*, ), optional\ - (default=None) - With this array, you can exclude coefficients from ths L1 penalty. - Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as an array of ones. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). - - P2 : None or array of shape (n_features*, n_features*) - With this square matrix the L2 penalty is calculated as `w P2 w`. - This gives a fine control over this penalty (Tikhonov - regularization). - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). - - check_input : boolean, optional (default=True) - Allow to bypass several input checking. - Don't use this parameter unless you know what you do. - Returns ------- self : returns an instance of self. 
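A small illustrative sketch of the weighted-average interpretation of `sample_weight` documented above: giving a sample weight 2 should act like duplicating it (toy data; unpenalized fit assumed to converge)::

    import numpy as np
    from numpy.testing import assert_allclose
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([1.0, 2.0, 3.0, 5.0])

    glm_weighted = GeneralizedLinearRegressor(alpha=0, family='normal').fit(
        X, y, sample_weight=[1, 1, 1, 2])
    glm_repeated = GeneralizedLinearRegressor(alpha=0, family='normal').fit(
        np.vstack([X, X[-1:]]), np.append(y, y[-1]))
    assert_allclose(glm_weighted.coef_, glm_repeated.coef_)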
@@ -872,16 +879,6 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, elif weights.shape[0] != y.shape[0]: raise ValueError("Sample weights must have the same length as" " y") - # IMPORTANT NOTE: Since we want to minimize - # 1/(2*sum(sample_weight)) * deviance + L1 + L2, - # deviance = sum(sample_weight * unit_deviance), - # we rescale weights such that sum(weights) = 1 and this becomes - # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) - weights = weights/np.sum(weights) - - if not isinstance(check_input, bool): - raise ValueError("The argument check_input must be bool; got " - "(check_input={0})".format(check_input)) # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class @@ -928,17 +925,22 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['irls', 'lbfgs', 'newton-cg']: - raise ValueError("GLM Regression supports only irls, lbfgs and" - "newton-cg solvers, got {0}".format(self.solver)) + if self.solver == 'auto': + solver = 'irls' + else: + solver = self.solver + if solver not in ['irls', 'lbfgs', 'newton-cg']: + raise ValueError("GeneralizedLinearRegressor supports only irls, " + "lbfgs and newton-cg solvers, got {0}" + "".format(solver)) if self.alpha > 0: if (self.l1_ratio > 0 and - self.solver not in []): + solver not in []): # TODO: Add solver for L1 # raise ValueError("The solver option (solver={0}) is not " # "appropriate for the chosen penalty which" # " includes L1 (alpha={1})." - # .format(self.solver, self.alpha)) + # .format(solver, self.alpha)) raise NotImplementedError("Currently, no solver is implemented" " that can deal with L1 penalties.") if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: @@ -964,28 +966,32 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The attribute check_input must be bool; got " + "(check_input={0})".format(self.check_input)) - if P1 is None: + if self.P1 is None: P1 = np.ones(X.shape[1]) else: - P1 = np.atleast_1d(P1) + P1 = np.atleast_1d(np.copy(self.P1)) if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." 
.format(P1.shape[0], X.shape[1])) - if P2 is None: + if self.P2 is None: P2 = np.ones(X.shape[1]) if sparse.issparse(X): P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() else: - P2 = check_array(P2, accept_sparse=['csr', 'csc', 'coo'], + P2 = check_array(self.P2, copy=True, + accept_sparse=['csr', 'csc', 'coo'], dtype="numeric", ensure_2d=True) - if ((P2.shape[0] != P2.shape[1]) or - (P2.shape[0] != X.shape[1]) or - (P2.ndim != 2)): + if ((P2.ndim != 2) or + (P2.shape[0] != P2.shape[1]) or + (P2.shape[0] != X.shape[1])): raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " @@ -1020,16 +1026,39 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, P2 *= l2 # 1.3 additional validations ########################################## - if check_input: + if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + # check that P2 is positive semidefinite + # np.linalg.cholesky(P2) 'only' asserts positive definite + if self.P2 is not None: + if sparse.issparse(P2): + # TODO: check sparse P2 for non-negativeness + raise NotImplementedError("Check sparse P2 for " + "non-negaitveness is not yet " + "implemented.") + elif P2.ndim == 2: + if not np.all(np.linalg.eigvals(P2) >= -1e-15): + raise ValueError("P2 must be positive definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? ####################################################################### - # 2. initialization of coef = (intercept_, coef_) # + # 2. rescaling of weights (sample_weight) # + ####################################################################### + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights = weights/np.sum(weights) + + ####################################################################### + # 3. initialization of coef = (intercept_, coef_) # ####################################################################### # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. @@ -1082,13 +1111,13 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, coef = start_params ####################################################################### - # 3. fit # + # 4. 
fit # ####################################################################### # algorithms for optimiation # TODO: Parallelize it self.n_iter_ = 0 converged = False - # 3.1 IRLS ############################################################ + # 4.1 IRLS ############################################################ # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' # Obj = objective function = 1/2 Dev + l2/2 w P2 w # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 @@ -1104,7 +1133,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, # Obj'' ~ X' W X + l2 P2 # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) # Note: P2 = l2*P2, see above - if self.solver == 'irls': + if solver == 'irls': # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) @@ -1150,10 +1179,10 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, "of iterations (currently {0})" .format(self.max_iter), ConvergenceWarning) - # 3.2 L-BFGS and Newton-CG ############################################ + # 4.2 L-BFGS and Newton-CG ############################################ # TODO: performance: make one function return both deviance and # gradient of deviance - elif self.solver in ['lbfgs', 'newton-cg']: + elif solver in ['lbfgs', 'newton-cg']: def func(coef, *args): if P2.ndim == 1: L2 = safe_sparse_dot(coef.T, P2*coef) @@ -1190,7 +1219,7 @@ def Hs(s): args = (Xnew, y, weights, link) - if self.solver == 'lbfgs': + if solver == 'lbfgs': coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1204,13 +1233,37 @@ def Hs(s): warnings.warn("lbfgs failed for the reason: {0}" .format(info["task"])) self.n_iter_ = info['nit'] - elif self.solver == 'newton-cg': + elif solver == 'newton-cg': coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, args=args, maxiter=self.max_iter, tol=self.tol) + # 4.3 coordinate descent ############################################## + # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + # An Improved GLMNET for L1-regularized Logistic Regression, + # Journal of Machine Learning Research 13 (2012) 1999-2030 + # Note: Use Fisher matrix instead of Hessian + # + # 1. find optimal descent direction d by minimizing + # min_d F(w+d) = min_d F(w+d) - F(w) + # F = f + g; f(w) = 1/2 dev; g(w) = 1/2*w*P2*w + ||P1*w||_1 + # 2. quadrdatic approx of f(w+d)-f(w): + # q(d) = f'(w)*d +1/2 d*H*d + # min_d q(d) + g(w+d) - g(w) + # 3. coordinate descent by updating coordinate j (d -> d+z*e_j): + # min_z q(d+z*e_j) + g(w+d+z*e_j) - g(w) + # = min_z q(d+z e_j) - q(d) + g(w+d+z*e_j) - g(w+d) + # TODO + # elif solver == 'cd': + # line search parameters + # (beta, sigma) = (0.5, 0.01) + # for iteration k from 1 to maxiter + # for coordinate j sample at random + # np.random.choice(coord, replace = False) + # + ####################################################################### - # 4. postprocessing # + # 5. 
postprocessing # ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] From 5c1369bde863a73aff46a502acd70e58e06dcb85 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 15:22:08 +0100 Subject: [PATCH 10/69] [WIP] Add Generalized Linear Models (#9405) * added coordinate descent solver * skip doctest for GeneralizedLinearRegressor example * symmetrize P2 => use P2 = 1/2 (P2+P2') * better validation of parameter start_params --- doc/modules/linear_model.rst | 21 +- sklearn/linear_model/glm.py | 353 ++++++++++++++++++++----- sklearn/linear_model/tests/test_glm.py | 82 ++++-- 3 files changed, 365 insertions(+), 91 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 98736facd9b76..834466e494a4a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -894,9 +894,9 @@ becomes +\frac{\alpha(1-\rho)}{2} w^T P_2 w with sample weights :math:`s`. -:math:`P_1` can be used to exclude some of the coefficients in the L1 -penalty, :math:`P_2` (must be positive semi-definite) allows for a more -versatile L2 penalty. +:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in +the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows +for a more versatile L2 penalty. Use cases, where a loss different from the squared loss might be appropriate, are the following: @@ -908,22 +908,23 @@ are the following: * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). Since the linear predictor :math:`Xw` can be negative and -Poisson, Gamma and Inverse Gaussian distributions don't have negative values, +Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as -follows:: +follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - GeneralizedLinearRegressor(alpha=0.5, copy_X=True, family='poisson', - fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, - link='log', max_iter=100, solver='irls', start_params=None, - tol=0.0001, verbose=0, warm_start=False) + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +SKIP + GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, + copy_X=True, family='poisson', fit_dispersion='chisqr', + fit_intercept=True, l1_ratio=0, link='log', max_iter=100, + random_state=None, selection='random', solver='auto', + start_params=None, tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 3f6b91026ef9b..3de82c20f33cf 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -3,20 +3,21 @@ """ # Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. 
# License: BSD 3 clause # TODO: Write more tests -# TODO: Add l1-penalty (elastic net) +# TODO: Write examples and more docu # TODO: deal with option self.copy_X # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation -# TODO: Write examples and more docu -# TODO: Make it as much consistent to other estimators in linear_model as -# possible -# TODO: which dtype to force for y and X? Which for P1, P2? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? +# TODO: Negative values in P1 are not allowed so far. They could be used form +# group lasse. # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -52,10 +53,11 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six +from ..externals.six.moves import xrange from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, check_random_state class Link(six.with_metaclass(ABCMeta)): @@ -309,7 +311,9 @@ def _score(self, coef, phi, X, y, weights, link): .. math: - \mathbf{score}(\boldsymbol{w}) = \mathbf{X}^T \mathbf{D} + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and @@ -411,6 +415,29 @@ def _deviance_hessian(self, coef, X, y, weights, link): weights=weights, link=link) return 2*info_matrix + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): + """Calculates eta (linear predictor), mu, score function (derivative + of log-likelihood) and Fisher matrix (all with phi=1) all in one go""" + n_samples, n_features = X.shape + # eta = linear predictor + eta = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(eta) + sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) + d1 = link.inverse_derivative(eta) # = h'(eta) + # Alternatively: + # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g + # d1 = 1./link.derivative(mu) + d1_sigma_inv = sparse.dia_matrix((sigma_inv*d1, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d1_sigma_inv, (y-mu), dense_output=True) + score = safe_sparse_dot(X.T, temp, dense_output=True) + # + d2_sigma_inv = sparse.dia_matrix((sigma_inv*(d1**2), 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) + fisher = safe_sparse_dot(X.T, temp, dense_output=False) + return eta, mu, score, fisher + def starting_mu(self, y, weights=1): """Starting values for the mean mu_i in (unpenalized) IRLS.""" return ((weights*y+np.mean(weights*y)) / @@ -670,8 +697,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): alpha = a + b and l1_ratio = a / (a + b) The parameter `l1_ratio` corresponds to alpha in the glmnet R package while - alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio - = 1 is the lasso penalty. + 'alpha' corresponds to the lambda parameter in glmnet. Specifically, + l1_ratio = 1 is the lasso penalty. Read more in the :ref:`User Guide `. 
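As a minimal sketch of the penalty parametrization described above (the function name and example values are illustrative only, not part of the estimator's API), the two terms controlled by ``alpha`` and ``l1_ratio`` can be written as:

    import numpy as np

    def elastic_net_penalty(w, alpha, l1_ratio, P1, P2):
        # L1 part: alpha * l1_ratio * ||P1 * w||_1
        l1 = alpha * l1_ratio * np.sum(np.abs(P1 * w))
        # L2 part: alpha * (1 - l1_ratio) / 2 * w' P2 w
        l2 = 0.5 * alpha * (1 - l1_ratio) * np.dot(w, np.dot(P2, w))
        return l1 + l2

    # l1_ratio=1 gives the lasso penalty, l1_ratio=0 a (generalized) ridge
    # penalty; in glmnet's notation, alpha here plays the role of lambda
    # and l1_ratio the role of alpha.
    w = np.array([0.5, -1.0])
    elastic_net_penalty(w, alpha=0.5, l1_ratio=0.3,
                        P1=np.ones(2), P2=np.eye(2))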
@@ -686,6 +713,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO: For `alpha` > 0, the feature matrix `X` is assumed to be standardized. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + Otherwise, the strength of the penalty is different for the features. TODO: Estimation of the dispersion parameter phi. @@ -742,19 +770,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'newton-cg', 'lbfgs'}, optional (defaul='auto') + solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ + optional (defaul='auto') Algorithm to use in the optimization problem. + - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. + - 'irls' is iterated reweighted least squares (Fisher scoring). It is the standard algorithm for GLMs. Cannot deal with L1 penalties. - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'auto' sets 'irls'. + - 'cd' is the coordinate descent algorithm. It can deal with L1 and + L2 penalties. max_iter : int, optional (default=100) - TODO + The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, @@ -781,6 +813,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): This option only applies if ``warm_start=False`` or if fit is called the first time (``self.coef_`` does not exist). + selection : str, optional (default='random') + For the solver 'cd' (coordinate descent), the coordinates (features) + can be updated in either cyclic or random order. + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator that selects a random + feature to be updated for solver 'cd' (coordinate descent). + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. 
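The 'auto' rule documented above amounts to a small dispatch on ``l1_ratio``; a sketch (the helper name is illustrative only):

    def pick_solver(solver, l1_ratio):
        # 'auto' maps to IRLS for pure L2 problems and to coordinate
        # descent ('cd') as soon as an L1 term is present.
        if solver == 'auto':
            return 'irls' if l1_ratio == 0 else 'cd'
        # 'irls', 'lbfgs' and 'newton-cg' cannot handle an L1 penalty;
        # only 'cd' can.
        return solver

    pick_solver('auto', 0.0)  # -> 'irls'
    pick_solver('auto', 0.5)  # -> 'cd'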
@@ -816,7 +865,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', fit_dispersion='chisqr', solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params=None, copy_X=True, + tol=1e-4, warm_start=False, start_params=None, + selection='random', random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio @@ -831,6 +881,8 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, self.tol = tol self.warm_start = warm_start self.start_params = start_params + self.selection = selection + self.random_state = random_state self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -925,24 +977,21 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver == 'auto': - solver = 'irls' - else: - solver = self.solver - if solver not in ['irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: raise ValueError("GeneralizedLinearRegressor supports only irls, " - "lbfgs and newton-cg solvers, got {0}" - "".format(solver)) - if self.alpha > 0: - if (self.l1_ratio > 0 and - solver not in []): - # TODO: Add solver for L1 - # raise ValueError("The solver option (solver={0}) is not " - # "appropriate for the chosen penalty which" - # " includes L1 (alpha={1})." - # .format(solver, self.alpha)) - raise NotImplementedError("Currently, no solver is implemented" - " that can deal with L1 penalties.") + "auto, lbfgs, newton-cg and cd solvers, got {0}" + "".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iteration must be positive;" " got (max_iter={0!r})".format(self.max_iter)) @@ -953,7 +1002,14 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params != 'least_squares': + if start_params is None: + pass + elif isinstance(start_params, six.string_types): + if start_params not in ['least_squares']: + raise ValueError("The argument start_params must be None, " + "'least-squares' or an array of right length," + " got(start_params={0})".format(start_params)) + else: start_params = np.atleast_1d(start_params) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): @@ -963,6 +1019,12 @@ def fit(self, X, y, sample_weight=None): .format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) + + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random', got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -974,15 +1036,16 @@ def fit(self, X, y, sample_weight=None): P1 = np.ones(X.shape[1]) else: P1 = np.atleast_1d(np.copy(self.P1)) - if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): + if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." .format(P1.shape[0], X.shape[1])) if self.P2 is None: - P2 = np.ones(X.shape[1]) - if sparse.issparse(X): + if not sparse.issparse(X): + P2 = np.ones(X.shape[1]) + else: P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() else: @@ -1024,6 +1087,12 @@ def fit(self, X, y, sample_weight=None): l2 = self.alpha * (1-self.l1_ratio) P1 *= l1 P2 *= l2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if sparse.issparse(P2): + P2 = 0.5 * (P2 + P2.transpose()) + else: + P2 = 0.5 * (P2 + P2.T) # 1.3 additional validations ########################################## if self.check_input: @@ -1033,14 +1102,20 @@ def fit(self, X, y, sample_weight=None): .format(family.__class__.__name__)) if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") - # check that P2 is positive semidefinite + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
+ if self.P1 is not None: + if not np.all(P1 >= 0): + raise ValueError("P1 must not have negative values.") + # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite if self.P2 is not None: if sparse.issparse(P2): # TODO: check sparse P2 for non-negativeness - raise NotImplementedError("Check sparse P2 for " - "non-negaitveness is not yet " - "implemented.") + # raise NotImplementedError("Check sparse P2 for " + # "non-negaitveness is not yet " + # "implemented.") + pass elif P2.ndim == 2: if not np.all(np.linalg.eigvals(P2) >= -1e-15): raise ValueError("P2 must be positive definite.") @@ -1090,7 +1165,8 @@ def fit(self, X, y, sample_weight=None): else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif self.start_params == 'least_squares': + elif (isinstance(self.start_params, six.string_types) and + self.start_params == 'least_squares'): if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1102,7 +1178,7 @@ def fit(self, X, y, sample_weight=None): reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: - # TODO: Does this make sense? + # TODO: Does this make sense at all? reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio) reg.fit(Xnew, link.link(y)) @@ -1125,14 +1201,17 @@ def fit(self, X, y, sample_weight=None): # D2 = link.inverse_derivative(eta)^2 = D^2 # W = D2/V(mu) # l2 = alpha * (1 - l1_ratio) - # Obj' = d(Obj)/d(w) = 1/2 Dev' + P2 w + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w # = -X' D (y-mu)/V(mu) + l2 P2 w # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 # Use Fisher matrix instead of full info matrix -X'(...) X, # i.e. E[Dev''] with E[y-mu]=0: # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) - # Note: P2 = l2*P2, see above + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: we already set P2 = l2*P2, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: ' denotes derivative, but also transpose for matrices if solver == 'irls': # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) @@ -1150,9 +1229,8 @@ def fit(self, X, y, sample_weight=None): # working observations z = eta + (y-mu)/hp # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z + # A = X' W X + P2, b = X' W z coef = _irls_step(Xnew, W, P2, z) - # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) @@ -1242,25 +1320,182 @@ def Hs(s): # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin # An Improved GLMNET for L1-regularized Logistic Regression, # Journal of Machine Learning Research 13 (2012) 1999-2030 - # Note: Use Fisher matrix instead of Hessian + # Note: Use Fisher matrix instead of Hessian for H # # 1. find optimal descent direction d by minimizing # min_d F(w+d) = min_d F(w+d) - F(w) - # F = f + g; f(w) = 1/2 dev; g(w) = 1/2*w*P2*w + ||P1*w||_1 - # 2. quadrdatic approx of f(w+d)-f(w): - # q(d) = f'(w)*d +1/2 d*H*d - # min_d q(d) + g(w+d) - g(w) + # F = f + g, f(w) = 1/2 deviance, g(w) = 1/2 w*P2*w + ||P1*w||_1 + # 2. quadrdatic approximation of F(w+d)-F(w) = q(d): + # using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives + # q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + # + ||P1*(w+d)||_1 - ||P1*w||_1 + # min_d q(d) # 3. 
coordinate descent by updating coordinate j (d -> d+z*e_j): - # min_z q(d+z*e_j) + g(w+d+z*e_j) - g(w) - # = min_z q(d+z e_j) - q(d) + g(w+d+z*e_j) - g(w+d) - # TODO - # elif solver == 'cd': + # min_z q(d+z*e_j) + # = min_z q(d+z*e_j) - q(d) + # = min_z A_j z + 1/2 B_jj z^2 + # + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + # A = f'(w) + d*H(w) + (w+d)*P2 + # B = H+P2 + # Note: we already set P2 = l2*P2, P1 = l1*P1, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: f' = -score, H = Fisher matrix + elif solver == 'cd': # line search parameters - # (beta, sigma) = (0.5, 0.01) - # for iteration k from 1 to maxiter - # for coordinate j sample at random - # np.random.choice(coord, replace = False) - # + (beta, sigma) = (0.5, 0.01) + # max inner loops (cycles through all features) + max_inner_iter = 1000 + # some precalculations + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # initial stopping tolerance of inner loop + # use L1-norm of minimum-norm of subgradient of F + # fp_wP2 = f'(w) + w*P2 + if P2.ndim == 1: + fp_wP2 = -score + coef*P2 + else: + fp_wP2 = -score + safe_sparse_dot(coef, P2) + inner_tol = (np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2+np.sign(coef)*P1)) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while self.n_iter_ < self.max_iter: + self.n_iter_ += 1 + # initialize search direction d (to be optimized) + d = np.zeros_like(coef) + # inner loop + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + d = np.zeros_like(coef) + # A = f'(w) + d*H(w) + (w+d)*P2 + # B = H+P2 + # Note: f'=-score and H=fisher are updated at the end of outer + # iteration + B = fisher + if P2.ndim == 1: + coef_P2 = coef * P2 + B[np.diag_indices_from(B)] += P2 + else: + coef_P2 = safe_sparse_dot(coef, P2) + B += P2 + A = -score + coef_P2 # + d*(H+P2) but d=0 so far + inner_iter = 0 + while inner_iter < max_inner_iter: + inner_iter += 1 + if self.selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, ee 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + a = A[j] + b = B[j, j] + if P1[j] == 0: + if b == 0: + z = 0 + else: + z = -a/b + elif a + P1[j] < b * (coef[j]+d[j]): + if b == 0: + z = 0 + else: + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[j]+d[j]): + if b == 0: + z = 0 + else: + z = -(a - P1[j])/b + else: + z = -(coef[j] + d[j]) + # update direction d + d[j] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if sparse.issparse(B): + if sparse.isspmatrix_csc(B): + # slice columns + A += B[:, j].toarray().ravel() * z + else: + # slice rows + A += B[j, :].toarray().ravel() * z + else: + A += B[j, :] * z + # end of cycle + # stopping criterion for inner loop + # sum_i(|minimum-norm subgrad of q(d)_i|) + mn_subgrad = (np.where(coef + d == 0, + np.sign(A)*np.maximum(np.abs(A)-P1, 0), + A+np.sign(coef+d)*P1)) + mn_subgrad = np.sum(np.abs(mn_subgrad)) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. 
+ break + # end of inner loop + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1*coef, ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * ( + safe_sparse_dot(-score, d) + + safe_sparse_dot(coef_P2, d) + + linalg.norm(P1*(coef+d), ord=1) - + P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * safe_sparse_dot(coef_P2, coef) + + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + mu_wd = link.inverse(safe_sparse_dot(Xnew, coef+la*d, + dense_output=True)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1*(coef+la*d), ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * safe_sparse_dot((coef+la*d)*P2, coef+la*d) + else: + Fwd += 0.5 * (safe_sparse_dot(coef+la*d, + safe_sparse_dot(P2, coef+la*d))) + if Fwd-Fw <= sigma*la*bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # stopping criterion for outer loop + # sum_i(|minimum-norm subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + if P2.ndim == 1: + fp_wP2 = -score + coef*P2 + else: + fp_wP2 = -score + safe_sparse_dot(coef, P2) + mn_subgrad = (np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2+np.sign(coef)*P1)) + mn_subgrad = np.sum(np.abs(mn_subgrad)) + if mn_subgrad <= self.tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the number of iterations (currently {0})" + .format(self.max_iter), ConvergenceWarning) ####################################################################### # 5. postprocessing # diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de7de90db967b..87cc8bea45f5b 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,6 +1,7 @@ import numpy as np from numpy.testing import assert_allclose import scipy as sp +from scipy import sparse from sklearn.linear_model.glm import ( Link, @@ -126,7 +127,7 @@ def test_glm_identiy_regression(): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: for family in families: glm = GeneralizedLinearRegressor( alpha=0, family=family, fit_intercept=False, solver=solver) @@ -162,28 +163,31 @@ def test_normal_ridge(): rng = np.random.RandomState(0) alpha = 1.0 - # With more samples than features + # 1. 
With more samples than features n_samples, n_features, n_predict = 6, 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) T = rng.randn(n_predict, n_features) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, + solver='svd', normalize=False) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', - fit_intercept=True, solver=solver) + fit_intercept=True, tol=1e-6, + max_iter=100, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='svd', normalize=False) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -192,28 +196,30 @@ def test_normal_ridge(): assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) - # With more features than samples + # 2. With more features than samples and sparse n_samples, n_features, n_predict = 5, 10, 10 y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - T = rng.randn(n_predict, n_features) + X = sparse.csr_matrix(rng.randn(n_samples, n_features)) + T = sparse.csr_matrix(rng.randn(n_predict, n_features)) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, + solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', fit_intercept=True, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -240,9 +246,41 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, family='poisson', - link='log', tol=1e-10) + s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 7, 'cd': 7} + for solver in 
['irls', 'lbfgs', 'newton-cg', 'cd']: + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=1e-7, + solver=solver, max_iter=200) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=s_dec[solver]) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=s_dec[solver]) + + +def test_poisson_enet(): + """Test elastic net regression with poisson family and LogLink + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', tol=1e-7) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, decimal=7) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + assert_almost_equal(glm.intercept_, -0.03550978409, decimal=7) + assert_array_almost_equal(glm.coef_, [0.16936423283, 0.], decimal=7) From 91497a2abc4824cdcb72f88dc26c9fd347d54b0d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 20:37:27 +0100 Subject: [PATCH 11/69] [WIP] Add Generalized Linear Models (#9405) * bug for sparse matrices for newton-cg solver, function grad_hess * reduce precision for solver newton-cg in test_poisson_ridge * remedy doctest issues in linear_model.rst for example of GeneralizedLinearRegressor * remove unused import of xrange from six --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/glm.py | 3 +-- sklearn/linear_model/tests/test_glm.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 834466e494a4a..1f0946e97b059 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -919,7 +919,7 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +SKIP + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, copy_X=True, family='poisson', fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, link='log', max_iter=100, diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 3de82c20f33cf..25f3ee1f52a2e 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -53,7 +53,6 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six -from ..externals.six.moves import xrange from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg @@ -1288,7 +1287,7 @@ def grad_hess(coef, X, y, weights, link): if P2.ndim == 1: hessian[np.diag_indices_from(hessian)] += P2 else: - hessian += P2 + hessian = hessian + P2 def Hs(s): ret = safe_sparse_dot(hessian, s) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 87cc8bea45f5b..c48c59ebd0eda 100644 --- 
a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -246,7 +246,7 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 7, 'cd': 7} + s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', From b9e5105ddb011a2a4efd74eeb3033ebb824fa5a8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 21:44:10 +0100 Subject: [PATCH 12/69] [WIP] Add Generalized Linear Models (#9405) * bug in cd solver for sparse matrices * higer precision (smaller tol) in test_normal_ridge for sparse matrices * for each solver a separate precision (tol) in test_poisson_ridge --- sklearn/linear_model/glm.py | 2 +- sklearn/linear_model/tests/test_glm.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 25f3ee1f52a2e..b428ee7509d14 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1376,7 +1376,7 @@ def Hs(s): B[np.diag_indices_from(B)] += P2 else: coef_P2 = safe_sparse_dot(coef, P2) - B += P2 + B = B + P2 A = -score + coef_P2 # + d*(H+P2) but d=0 so far inner_iter = 0 while inner_iter < max_inner_iter: diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index c48c59ebd0eda..baad852dfb945 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -207,19 +207,20 @@ def test_normal_ridge(): solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, family='normal', link='identity', - fit_intercept=True, solver=solver) + fit_intercept=True, solver=solver, + max_iter=300) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -247,11 +248,12 @@ def test_poisson_ridge(): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} + s_tol = {'irls': 1e-8, 'lbfgs': 1e-7, 'newton-cg': 1e-7, 'cd': 1e-8} for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', - link='log', tol=1e-7, - solver=solver, max_iter=200) + link='log', tol=s_tol[solver], + solver=solver, max_iter=300) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, decimal=s_dec[solver]) From e317422e9dd860c4ed5a3c6ac6191eb8e560c365 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Jan 2018 21:44:04 +0100 Subject: [PATCH 13/69] [WIP] Add Generalized Linear Models 
(#9405) * improved documentation * additional option 'zero' for argument start_params * validation of sample_weight in function predict * input validation of estimate_phi * set default fit_dispersion=None * bug in estimate_phi because of weight rescaling * test for estimate_phi in normal ridge regression * extended tests for elastic net poisson --- sklearn/linear_model/glm.py | 116 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 39 +++++++-- 2 files changed, 110 insertions(+), 45 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b428ee7509d14..e5eda6108052c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -11,13 +11,13 @@ # TODO: deal with option self.copy_X # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. -# TODO: Add cross validation +# TODO: Add cross validation support # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used form -# group lasse. +# TODO: Negative values in P1 are not allowed so far. They could be used to +# for group lasso. # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -642,7 +642,7 @@ def _irls_step(X, W, P2, z): ------- coef: array, shape = (X.shape[1]) """ - # TODO: scipy.linalg.solve if faster, but ordinary least squares uses + # TODO: scipy.linalg.solve is faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? n_samples, n_features = X.shape if sparse.issparse(X): @@ -709,16 +709,20 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): (penalized) maximum likelihood which is equivalent to minimizing the deviance. - TODO: For `alpha` > 0, the feature matrix `X` is assumed to be - standardized. Call + For `alpha` > 0, the feature matrix `X` should be standardized in order to + penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - Otherwise, the strength of the penalty is different for the features. TODO: Estimation of the dispersion parameter phi. - TODO: Notes on weights and 'scaled' distributions. For Poisson, this means - to fit y = z/w with z=counts and w=exposure (time, money, persons, ...) - => y is a ratio with weights w. Same for other distributions. + If your target `y` is a ratio, you should also provide appropriate weights + `w`. As an example, consider Poission distributed counts `z` (integers) and + weights `w`=exposure (time, money, persons years, ...), then you fit + `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, + sample_weight=w)``. You need the weights for the right mean, consider: + :math:`\bar(y) = \frac{\sum_i w_i y_i}{\sum_i w_i}`. + In this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. Parameters ---------- @@ -800,8 +804,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to fit), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. 
- start_params : None or array of shape (n_features, ) or 'least_squares'}, \ - optional (default=None) + start_params : {None, 'least_squares', 'zero'} or array of shape \ + (n_features, ) or }, optional (default=None) If an array of size n_features is supplied, use these as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. @@ -854,16 +858,18 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations of the solver. - Notes - ----- References ---------- - TODO + For the coordinate descent implementation: + .. [1] Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', - fit_dispersion='chisqr', solver='auto', max_iter=100, + fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params=None, selection='random', random_state=None, copy_X=True, check_input=True, verbose=0): @@ -1004,9 +1010,10 @@ def fit(self, X, y, sample_weight=None): if start_params is None: pass elif isinstance(start_params, six.string_types): - if start_params not in ['least_squares']: + if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " - "'least-squares' or an array of right length," + "'least-squares', 'zero' or an array of right" + " length," " got(start_params={0})".format(start_params)) else: start_params = np.atleast_1d(start_params) @@ -1129,6 +1136,7 @@ def fit(self, X, y, sample_weight=None): # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights_sum = np.sum(weights) weights = weights/np.sum(weights) ####################################################################### @@ -1141,7 +1149,8 @@ def fit(self, X, y, sample_weight=None): coef = None if self.warm_start and hasattr(self, "coef_"): if self.fit_intercept: - coef = np.concatenate((self.intercept_, self.coef_)) + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) else: coef = self.coef_ elif self.start_params is None: @@ -1164,24 +1173,27 @@ def fit(self, X, y, sample_weight=None): else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif (isinstance(self.start_params, six.string_types) and - self.start_params == 'least_squares'): - if self.alpha == 0: - reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ - elif self.l1_ratio <= 0.01: - # ElasticNet says l1_ratio <= 0.01 is not reliable, use Ridge - reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ - else: - # TODO: Does this make sense at all? 
- reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ + elif isinstance(self.start_params, six.string_types): + if self.start_params == 'zero': + coef = np.zeros(n_features) + elif self.start_params == 'least_squares': + if self.alpha == 0: + reg = LinearRegression(copy_X=True, fit_intercept=False) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + elif self.l1_ratio <= 0.01: + # ElasticNet says l1_ratio <= 0.01 is not reliable + # => use Ridge + reg = Ridge(copy_X=True, fit_intercept=False, + alpha=self.alpha) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + else: + # TODO: Does this make sense at all? + reg = ElasticNet(copy_X=True, fit_intercept=False, + alpha=self.alpha, l1_ratio=self.l1_ratio) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ else: coef = start_params @@ -1365,6 +1377,7 @@ def Hs(s): d = np.zeros_like(coef) # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper d = np.zeros_like(coef) # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 @@ -1508,7 +1521,8 @@ def Hs(s): self.coef_ = coef if self.fit_dispersion in ['chisqr', 'deviance']: - self.dispersion_ = self.estimate_phi(y, X, weights) + # attention because of rescaling of weights + self.dispersion_ = self.estimate_phi(y, X, weights)*weights_sum return self @@ -1544,9 +1558,23 @@ def predict(self, X, sample_weight=1): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - # TODO: validation of sample_weight eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) + if sample_weight is None: + return mu + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weight must be positive, " + "got (sample_weight={0})." + .format(sample_weight)) + else: + sample_weights = np.atleast_1d(sample_weight) + if sample_weight.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar.") + elif sample_weight.shape[0] != mu.shape[0]: + raise ValueError("Sample weights must have the same length as" + " X.shape[1].") + return mu*sample_weight def estimate_phi(self, y, X, sample_weight): @@ -1554,10 +1582,20 @@ def estimate_phi(self, y, X, sample_weight): Returns the estimate. """ check_is_fitted(self, "coef_") + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ + n_features += 1 + if n_samples <= n_features: + raise ValueError("Estimation of dispersion parameter phi requires" + " more samples than features, got" + " samples=X.shape[0]={0} and" + " n_features=X.shape[1]+fit_intercept={1}." + .format(n_samples, n_features)) mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': chisq = np.sum(sample_weight*(y-mu)**2 / diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index baad852dfb945..ee90cd51c874d 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -164,7 +164,7 @@ def test_normal_ridge(): alpha = 1.0 # 1. 
With more samples than features - n_samples, n_features, n_predict = 6, 5, 10 + n_samples, n_features, n_predict = 10, 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) T = rng.randn(n_predict, n_features) @@ -189,12 +189,16 @@ def test_normal_ridge(): ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', - fit_intercept=False, solver='irls') + fit_intercept=False, solver='irls', + fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + mu = glm.predict(X) + assert_almost_equal(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) # 2. With more features than samples and sparse n_samples, n_features, n_predict = 5, 10, 10 @@ -278,11 +282,34 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', tol=1e-7) + link='log', solver='cd', tol=1e-7) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + + # same for start_params='zero' with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + start_params='zero') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + # start_params='least_squares' with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', + link='log', solver='cd', tol=1e-5, + start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.03550978409, decimal=7) - assert_array_almost_equal(glm.coef_, [0.16936423283, 0.], - decimal=7) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) From 9a9818441d605bf86547651997db81e969f41cdf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Jan 2018 22:59:46 +0100 Subject: [PATCH 14/69] [WIP] Add Generalized Linear Models (#9405) * new helper function _check_weights for validation of sample_weight * fix white space issue in doctest of linear_model.rst --- doc/modules/linear_model.rst | 8 ++--- sklearn/linear_model/glm.py | 59 +++++++++++++++++------------------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 1f0946e97b059..f7b0ca0cc7add 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -921,10 +921,10 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion='chisqr', - fit_intercept=True, l1_ratio=0, link='log', max_iter=100, - random_state=None, selection='random', solver='auto', - start_params=None, tol=0.0001, 
verbose=0, warm_start=False) + copy_X=True, family='poisson', fit_dispersion='chisqr', + fit_intercept=True, l1_ratio=0, link='log', max_iter=100, + random_state=None, selection='random', solver='auto', + start_params=None, tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index e5eda6108052c..138830ea431c9 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -59,6 +59,26 @@ from ..utils.validation import check_is_fitted, check_random_state +def _check_weights(sample_weight, n_samples): + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight < 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight*np.ones(n_samples) + else: + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as" + " y") + if not np.all(sample_weight >= 0): + raise ValueError("Sample weights must be non-negative.") + + return weights + + class Link(six.with_metaclass(ABCMeta)): """Abstract base class for Link funtions """ @@ -925,17 +945,7 @@ def fit(self, X, y, sample_weight=None): dtype=_dtype, y_numeric=True, multi_output=False) y = y.astype(np.float64) - if sample_weight is None: - weights = np.ones_like(y) - elif np.isscalar(sample_weight): - weights = sample_weight*np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != y.shape[0]: - raise ValueError("Sample weights must have the same length as" - " y") + weights = _check_weights(sample_weight, y.shape[0]) # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class @@ -1544,7 +1554,7 @@ def linear_predictor(self, X): return safe_sparse_dot(X, self.coef_, dense_output=True) + self.intercept_ - def predict(self, X, sample_weight=1): + def predict(self, X, sample_weight=None): """Predict uing GLM with feature matrix X. If sample_weight is given, returns prediction*sample_weight. @@ -1558,26 +1568,13 @@ def predict(self, X, sample_weight=1): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ + weights = _check_weights(sample_weight, X.shape[0]) eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) - if sample_weight is None: - return mu - elif np.isscalar(sample_weight): - if sample_weight <= 0: - raise ValueError("Sample weight must be positive, " - "got (sample_weight={0})." - .format(sample_weight)) - else: - sample_weights = np.atleast_1d(sample_weight) - if sample_weight.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar.") - elif sample_weight.shape[0] != mu.shape[0]: - raise ValueError("Sample weights must have the same length as" - " X.shape[1].") - return mu*sample_weight + return mu*weights - def estimate_phi(self, y, X, sample_weight): + def estimate_phi(self, y, X, sample_weight=None): """Estimation of the dispersion parameter. Returns the estimate. 
""" @@ -1586,6 +1583,7 @@ def estimate_phi(self, y, X, sample_weight): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape + weights = _check_weights(sample_weight, n_samples) eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ @@ -1640,10 +1638,7 @@ def score(self, X, y, sample_weight=None): # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) - if sample_weight is None: - weights = np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) + weights = _check_weights(sample_weight, y.shape[0]) mu = self.predict(X) dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) From db9defe6b7637fe022034ca7f435f4fd37f6c118 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 08:33:59 +0100 Subject: [PATCH 15/69] [WIP] Add Generalized Linear Models (#9405) * fit_dispersion default=None also in docs. * improved docs. * fixed input validation of predict * fixed bug for sample_weight in estimate_phi --- sklearn/linear_model/glm.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 138830ea431c9..22a8be9e50828 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -754,7 +754,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - l1_ratio : float, optional (defaul=0) + l1_ratio : float, optional (default=0) The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a @@ -768,7 +768,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). - P2 : None or array of shape (n_features*, n_features*) + P2 : None or array of shape (n_features*, n_features*), optional\ + (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov regularization). @@ -781,20 +782,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ of class ExponentialDispersionModel, optional(default='normal') - the distributional assumption of the GLM. + the distributional assumption of the GLM, i.e. which loss function to + be minimized. link : {'identity', 'log'} or an instance of class Link, optional (default='identity') the link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul='chisqr') + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ - optional (defaul='auto') + optional (default='auto') Algorithm to use in the optimization problem. - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. 
@@ -830,11 +832,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. If 'least_squares' is set, the result of a least squares fit in the - link space (linear predictor) is taken. If ``None``, the start values - are calculated by setting mu to family.starting_mu(..) and one step of - irls. - This option only applies if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not exist). + link space (linear predictor) is taken. + If 'zero' is set, all coefficients start with zero. + If ``None``, the start values are calculated by setting mu to + family.starting_mu(..) and one step of irls. + These options only apply if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). selection : str, optional (default='random') For the solver 'cd' (coordinate descent), the coordinates (features) @@ -1550,7 +1553,9 @@ def linear_predictor(self, X): Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) return safe_sparse_dot(X, self.coef_, dense_output=True) + self.intercept_ @@ -1568,9 +1573,10 @@ def predict(self, X, sample_weight=None): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - weights = _check_weights(sample_weight, X.shape[0]) + # validation of X in linear_predictor eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) + weights = _check_weights(sample_weight, X.shape[0]) return mu*weights @@ -1596,11 +1602,11 @@ def estimate_phi(self, y, X, sample_weight=None): .format(n_samples, n_features)) mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(sample_weight*(y-mu)**2 / + chisq = np.sum(weights*(y-mu)**2 / self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, sample_weight) + dev = self._family_instance.deviance(y, mu, weights) return dev/(n_samples - n_features) # Note: check_estimator(GeneralizedLinearRegressor) might raise From dc7fdd7f8f6a3a2276f5f96aa6d5dd6ad3ce853e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 08:41:24 +0100 Subject: [PATCH 16/69] [WIP] Add Generalized Linear Models (#9405) * improved docs --- sklearn/linear_model/tests/test_glm.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index ee90cd51c874d..0af837c9c73f3 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -177,7 +177,8 @@ def test_normal_ridge(): glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, tol=1e-6, - max_iter=100, solver=solver) + max_iter=100, solver=solver, + random_state=42) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -214,7 +215,7 @@ def test_normal_ridge(): glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, family='normal', link='identity', fit_intercept=True, solver=solver, - max_iter=300) + max_iter=300, random_state=42) glm.fit(X, y) assert_equal(glm.coef_.shape, 
(X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) @@ -257,7 +258,8 @@ def test_poisson_ridge(): glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', link='log', tol=s_tol[solver], - solver=solver, max_iter=300) + solver=solver, max_iter=300, + random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, decimal=s_dec[solver]) @@ -282,20 +284,23 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . + rand = 0 glmnet_intercept = -0.03550978409 glmnet_coef = [0.16936423283, 0.] X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-7) + link='log', solver='cd', tol=1e-7, + selection='random', random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) - # same for start_params='zero' with reduced precision + # same for start_params='zero' and selection='cyclic' + # with reduced precision glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', link='log', solver='cd', tol=1e-5, - start_params='zero') + selection='cyclic', start_params='zero') glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) From b11d06ba72865c14b0532c6d6c34d264a09d7ae4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 16:57:45 +0100 Subject: [PATCH 17/69] [WIP] Add Generalized Linear Models (#9405) * fixed input validation of X in predict --- sklearn/linear_model/glm.py | 4 +++- sklearn/linear_model/tests/test_glm.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 22a8be9e50828..032ded86816dd 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1573,7 +1573,9 @@ def predict(self, X, sample_weight=None): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - # validation of X in linear_predictor + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) weights = _check_weights(sample_weight, X.shape[0]) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 0af837c9c73f3..776edd8aeec46 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -284,7 +284,6 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . - rand = 0 glmnet_intercept = -0.03550978409 glmnet_coef = [0.16936423283, 0.] 
X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T From 9e6c01378a4cb245824bcb9429b3d566652af743 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 17:37:44 +0100 Subject: [PATCH 18/69] [WIP] Add Generalized Linear Models (#9405) * redundant line of code 'd = np.zeros_like(coef)' --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/glm.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f7b0ca0cc7add..dcb35b6a5d941 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -921,7 +921,7 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion='chisqr', + copy_X=True, family='poisson', fit_dispersion=None, fit_intercept=True, l1_ratio=0, link='log', max_iter=100, random_state=None, selection='random', solver='auto', start_params=None, tol=0.0001, verbose=0, warm_start=False) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 032ded86816dd..8856af8ec698a 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -73,7 +73,7 @@ def _check_weights(sample_weight, n_samples): elif weights.shape[0] != n_samples: raise ValueError("Sample weights must have the same length as" " y") - if not np.all(sample_weight >= 0): + if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") return weights @@ -1132,7 +1132,7 @@ def fit(self, X, y, sample_weight=None): if sparse.issparse(P2): # TODO: check sparse P2 for non-negativeness # raise NotImplementedError("Check sparse P2 for " - # "non-negaitveness is not yet " + # "non-negativeness is not yet " # "implemented.") pass elif P2.ndim == 2: @@ -1391,7 +1391,6 @@ def Hs(s): # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper - d = np.zeros_like(coef) # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 # Note: f'=-score and H=fisher are updated at the end of outer From bad0190a22623eae3a2f6dfcdb0fd8caee625111 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 27 Jan 2018 20:38:02 +0100 Subject: [PATCH 19/69] [WIP] Add Generalized Linear Models (#9405) * added test to compare to ElasticNet * deleted identical comment lines --- sklearn/linear_model/tests/test_glm.py | 29 +++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 776edd8aeec46..9990cafe2cbcf 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -12,7 +12,7 @@ GammaDistribution, InverseGaussianDistribution, GeneralizedHyperbolicSecand, GeneralizedLinearRegressor) -from sklearn.linear_model.ridge import Ridge +from sklearn.linear_model import ElasticNet, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, @@ -267,14 +267,37 @@ def test_poisson_ridge(): decimal=s_dec[solver]) +def test_normal_enet(): + """Tet elastic net regression with normal/gaussian family""" + rng = np.random.RandomState(0) + alpha, l1_ratio = 0.3, 0.7 + n_samples, n_features = 20, 2 + X = rng.randn(n_samples, n_features).copy(order='F') + beta = rng.randn(n_features) + y = 2 + np.dot(X, beta) + rng.randn(n_samples) + + glm = 
GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, + family='normal', link='identity', + fit_intercept=True, tol=1e-7, + max_iter=100, selection='cyclic', + solver='cd', start_params='zero', + check_input=False) + glm.fit(X, y) + + enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, + normalize=False, tol=1e-7, copy_X=True) + enet.fit(X, y) + + assert_almost_equal(glm.intercept_, enet.intercept_) + assert_array_almost_equal(glm.coef_, enet.coef_) + + def test_poisson_enet(): """Test elastic net regression with poisson family and LogLink Compare to R's glmnet""" # library("glmnet") # options(digits=10) - # library("glmnet") - # options(digits=10) # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) # x <- data.matrix(df[,c("a", "b")]) # y <- df$y From 48137d86079c9a8efd15d57e719e3ee35f1644c9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 11:52:19 +0100 Subject: [PATCH 20/69] [WIP] Add Generalized Linear Models (#9405) * increased precision in test_normal_enet --- sklearn/linear_model/tests/test_glm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 9990cafe2cbcf..6f8bdd3a72f40 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -278,18 +278,18 @@ def test_normal_enet(): glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, family='normal', link='identity', - fit_intercept=True, tol=1e-7, + fit_intercept=True, tol=1e-8, max_iter=100, selection='cyclic', solver='cd', start_params='zero', check_input=False) glm.fit(X, y) enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, - normalize=False, tol=1e-7, copy_X=True) + normalize=False, tol=1e-8, copy_X=True) enet.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_) - assert_array_almost_equal(glm.coef_, enet.coef_) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) def test_poisson_enet(): From 2c2a077a2e8c57bdf3c945678526d6efdb5763e9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 12:42:03 +0100 Subject: [PATCH 21/69] [WIP] Add Generalized Linear Models (#9405) * better doc for heavy tailed distributions --- doc/modules/linear_model.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index dcb35b6a5d941..5cb8e54afbb06 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -905,7 +905,8 @@ are the following: * If the target values are positive valued and skewed, you might try a Gamma deviance. - * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). + * If the target values seem to be heavier tailed than a Gamma distribution, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). + Keep in mind that the mean is not a good measure for very heavy tailed distributions, cf. extreme value theory. 
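A minimal, illustrative sketch of this rule of thumb (not part of the patch itself; it only assumes the ``GeneralizedLinearRegressor`` API introduced in this series, with made-up data)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(100, 2))
    lam = np.exp(1.0 + X @ np.array([0.5, -0.2]))

    # counts or frequencies -> Poisson deviance with log-link
    y_counts = rng.poisson(lam)
    GeneralizedLinearRegressor(family='poisson', link='log').fit(X, y_counts)

    # positive, skewed targets -> Gamma deviance with log-link
    y_skewed = rng.gamma(shape=2.0, scale=lam / 2.0)
    GeneralizedLinearRegressor(family='gamma', link='log').fit(X, y_skewed)

    # still heavier tails -> Inverse Gaussian deviance (or a higher Tweedie power)
    GeneralizedLinearRegressor(family='inverse.gaussian', link='log').fit(X, y_skewed)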
Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, From 15931c3148b68c47aa3c3c19983525ae758a0981 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 15:18:26 +0100 Subject: [PATCH 22/69] [WIP] Add Generalized Linear Models (#9405) * improved input validation and testing of them --- sklearn/linear_model/glm.py | 24 +++- sklearn/linear_model/tests/test_glm.py | 174 ++++++++++++++++++++++++- 2 files changed, 189 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 8856af8ec698a..33df6b4c9b850 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -63,11 +63,14 @@ def _check_weights(sample_weight, n_samples): if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): - if sample_weight < 0: + if sample_weight <= 0: raise ValueError("Sample weights must be non-negative.") weights = sample_weight*np.ones(n_samples) else: - weights = np.atleast_1d(sample_weight) + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse='csr', + force_all_finite=True, ensure_2d=False, + dtype=_dtype) if weights.ndim > 1: raise ValueError("Sample weight must be 1D array or scalar") elif weights.shape[0] != n_samples: @@ -75,6 +78,9 @@ def _check_weights(sample_weight, n_samples): " y") if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") return weights @@ -1010,10 +1016,12 @@ def fit(self, X, y, sample_weight=None): "with L1 penalties, which are included with " "(alpha={1}) and (l1_ratio={2})." 
.format(solver, self.alpha, self.l1_ratio)) - if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: - raise ValueError("Maximum number of iteration must be positive;" + if (not isinstance(self.max_iter, six.integer_types) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" " got (max_iter={0!r})".format(self.max_iter)) - if not isinstance(self.tol, numbers.Number) or self.tol < 0: + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol={0!r})".format(self.tol)) if not isinstance(self.warm_start, bool): @@ -1029,7 +1037,9 @@ def fit(self, X, y, sample_weight=None): " length," " got(start_params={0})".format(start_params)) else: - start_params = np.atleast_1d(start_params) + start_params = check_array(start_params, accept_sparse='csr', + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" @@ -1160,7 +1170,7 @@ def fit(self, X, y, sample_weight=None): # set start values for coef coef = None - if self.warm_start and hasattr(self, "coef_"): + if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), self.coef_)) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 6f8bdd3a72f40..1abbcf0540e28 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -5,7 +5,7 @@ from sklearn.linear_model.glm import ( Link, - # IdentityLink, + IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, @@ -16,7 +16,8 @@ from sklearn.utils.testing import ( assert_equal, assert_almost_equal, - assert_array_equal, assert_array_almost_equal) + assert_array_equal, assert_array_almost_equal, + assert_raises) def test_link_properties(): @@ -102,6 +103,34 @@ def f(coef): assert_allclose(fisher, approx, rtol=1e-3) +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight""" + # 1. scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y, weights) + + # 2. 2d array + weights = [[0]] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 3. 1d but wrong length + weights = [1, 0] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 4. 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 5. 
1d but weith a negative value + weights = [2, -1] + assert_raises(ValueError, glm.fit, X, y, weights) + + def test_glm_family_argument(): """Test GLM family argument set as string """ @@ -115,6 +144,147 @@ def test_glm_family_argument(): alpha=0).fit(X, y) assert_equal(type(glm._family_instance), type(fam)) + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_link_argument(): + """Test GLM link argument set as string + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for (l, link) in [('identity', IdentityLink()), + ('log', LogLink())]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) + + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + link='not a link') + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_alpha_argument(): + """Test GLM alpha argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for alpha in ['not a number', -4.2]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + alpha=alpha) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_l1_ratio_argument(): + """Test GLM l1_ratio argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for l1_ratio in ['not a number', -4.2, 1.1, [1]]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + l1_ratio=l1_ratio) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_fit_intercept_argument(): + """Test GLM fit_intercept argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for fit_intercept in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_solver_argument(): + """Test GLM solver argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for solver in ['not a solver', 1, [1]]: + glm = GeneralizedLinearRegressor(solver=solver) + assert_raises(ValueError, glm.fit, X, y) + + # solver not suitable for L1 penalty + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(solver=solver, alpha=1, l1_ratio=0.1) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_max_iter_argument(): + """Test GLM max_iter argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for max_iter in ['not a number', 0, -1, 5.5, [1]]: + glm = GeneralizedLinearRegressor(max_iter=max_iter) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_tol_argument(): + """Test GLM tol argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for tol in ['not a number', 0, -1.0, [1e-3]]: + glm = GeneralizedLinearRegressor(tol=tol) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_warm_start_argument(): + """Test GLM warm_start argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for warm_start in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(warm_start=warm_start) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_start_params_argument(): + """Test GLM start_params argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for start_params in ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]: + glm = GeneralizedLinearRegressor(start_params=start_params) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_selection_argument(): + """Test GLM selection argument + """ + y = np.array([1, 2]) + X = np.array([[1], 
[1]]) + for selection in ['not a selection', 1, 0, ['cyclic']]: + glm = GeneralizedLinearRegressor(selection=selection) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_check_input_argument(): + """Test GLM check_input argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for check_input in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(check_input=check_input) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_random_state_argument(): + """Test GLM random_state argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for random_state in ['a string', 0.5, [0]]: + glm = GeneralizedLinearRegressor(random_state=random_state) + assert_raises(ValueError, glm.fit, X, y) + + +# TODO: check P1 and P2 +# TODO: check additional validations if check_input == True def test_glm_identiy_regression(): """Test GLM regression with identity link on a simple dataset From feedba379c2a8f53a7a9792e33041f29da7a4c95 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 30 Mar 2018 18:50:06 +0200 Subject: [PATCH 23/69] [MRG] Add Generalized Linear Models (#9405) * improved input validation and testing of P1 * test case for validation of argument P2 * test case for validation of argument copy_X --- sklearn/linear_model/glm.py | 3 ++ sklearn/linear_model/tests/test_glm.py | 48 +++++++++++++++++++++----- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 33df6b4c9b850..eae4a56ea1d95 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1065,6 +1065,9 @@ def fit(self, X, y, sample_weight=None): P1 = np.ones(X.shape[1]) else: P1 = np.atleast_1d(np.copy(self.P1)) + if P1.dtype.kind not in ['b', 'i', 'u', 'f']: + raise ValueError("P1 must be a numeric value; " + "got (dtype={0}).".format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1abbcf0540e28..edf579a416973 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -187,6 +187,27 @@ def test_glm_l1_ratio_argument(): assert_raises(ValueError, glm.fit, X, y) +def test_glm_P1_argument(): + """Test GLM P1 arguments + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for P1 in [['a string', 'a string'], [1, [2]], [1, 2, 3]]: + glm = GeneralizedLinearRegressor(P1=P1) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_P2_argument(): + """Test GLM P2 arguments + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for P2 in [np.full((2, 2), 'a string'), [[1, [2]], [3, 4]], [1, 2, 3], + [[1, 2]], [[1], [2]]]: + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y) + + def test_glm_fit_intercept_argument(): """Test GLM fit_intercept argument """ @@ -263,27 +284,36 @@ def test_glm_selection_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_check_input_argument(): - """Test GLM check_input argument +def test_glm_random_state_argument(): + """Test GLM random_state argument """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for check_input in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(check_input=check_input) + for random_state in ['a string', 0.5, [0]]: + glm = GeneralizedLinearRegressor(random_state=random_state) assert_raises(ValueError, glm.fit, X, y) -def 
test_glm_random_state_argument(): - """Test GLM random_state argument +def test_glm_copy_X_argument(): + """Test GLM copy_X arguments """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for random_state in ['a string', 0.5, [0]]: - glm = GeneralizedLinearRegressor(random_state=random_state) + for copy_X in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(copy_X=copy_X) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_check_input_argument(): + """Test GLM check_input argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for check_input in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(check_input=check_input) assert_raises(ValueError, glm.fit, X, y) -# TODO: check P1 and P2 # TODO: check additional validations if check_input == True def test_glm_identiy_regression(): From 6fdfb47428571b4c8e89046a7b1f481711832f61 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 30 Mar 2018 19:41:09 +0200 Subject: [PATCH 24/69] [MRG] Add Generalized Linear Models (#9405) * fix doctest failure in example of linear_model.rst * fix dtype issue in test_glm_P2_argument --- doc/modules/linear_model.rst | 4 ++-- sklearn/linear_model/tests/test_glm.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 5cb8e54afbb06..9f85da771c6f1 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -926,8 +926,8 @@ follows: fit_intercept=True, l1_ratio=0, link='log', max_iter=100, random_state=None, selection='random', solver='auto', start_params=None, tol=0.0001, verbose=0, warm_start=False) - >>> reg.coef_ - array([ 0.24630255, 0.43373521]) + >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + array([0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS -0.76383575... diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index edf579a416973..c5d132d35bdb9 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -202,8 +202,8 @@ def test_glm_P2_argument(): """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for P2 in [np.full((2, 2), 'a string'), [[1, [2]], [3, 4]], [1, 2, 3], - [[1, 2]], [[1], [2]]]: + for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Sun, 5 Aug 2018 14:48:33 +0200 Subject: [PATCH 25/69] [MRG] Add Generalized Linear Models (#9405) * fix typos in doc --- doc/modules/linear_model.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 9f85da771c6f1..a204ccb080cc9 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -934,7 +934,7 @@ follows: Mathematical formulation ------------------------ -In the unpenalized case, the assumptions are the folowing: +In the unpenalized case, the assumptions are the following: * The target values :math:`y_i` are realizations of random variables :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` @@ -951,7 +951,7 @@ same as specifying a unit variance function (they are one-to-one). Including penalties helps to avoid overfitting or, in case of L1 penalty, to obtain sparse solutions. But there are also other motivations to include them, -e.g. accounting fo dependence structure of :math:`y`. +e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. 
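To make the minimized quantity concrete, the following worked sketch (illustrative only, not part of any patch in this series) evaluates the penalized objective by hand for the small Poisson example from the documentation: the deviance scaled by 1/(2*sum(s)) plus the elastic-net penalty with P1 = P2 = identity. Only ``GeneralizedLinearRegressor`` comes from this branch; the helper ``objective`` below is written purely for illustration::

    import numpy as np
    from scipy.special import xlogy
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[0, 0], [0, 1], [2, 2]])
    y = np.array([0, 1, 2])
    alpha, l1_ratio = 0.5, 0  # same settings as in the documentation example
    reg = GeneralizedLinearRegressor(alpha=alpha, family='poisson', link='log').fit(X, y)

    def objective(coef, intercept):
        mu = np.exp(intercept + X @ coef)                # inverse of the log-link
        dev = 2 * np.sum(xlogy(y, y / mu) - y + mu)      # Poisson deviance with s_i = 1
        penalty = alpha * (l1_ratio * np.sum(np.abs(coef))
                           + 0.5 * (1 - l1_ratio) * coef @ coef)
        return dev / (2 * X.shape[0]) + penalty

    # the fitted (unpenalized intercept, penalized coef) should approximately minimize it
    print(objective(reg.coef_, reg.intercept_))
    print(objective(reg.coef_ + 0.1, reg.intercept_))    # expected to be larger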
From 809e3a2747e451fc97b9a462cf682110d59fac25 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 26 Aug 2018 20:41:25 +0200 Subject: [PATCH 26/69] Remove test_glm_P2_argument --- sklearn/linear_model/tests/test_glm.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index c5d132d35bdb9..82b8ec2435543 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -197,15 +197,15 @@ def test_glm_P1_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_P2_argument(): - """Test GLM P2 arguments - """ - y = np.array([1, 2]) - X = np.array([[1], [1]]) - for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Thu, 30 Aug 2018 19:33:27 +0200 Subject: [PATCH 27/69] Filter out DeprecationWarning in old versions of scipy.sparse.linalg.spsolve about usage of umfpack --- sklearn/linear_model/tests/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 82b8ec2435543..e4be75ddb7a64 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -355,6 +355,7 @@ def test_glm_log_regression(): assert_array_almost_equal(res.coef_, coef) +@pytest.mark.filterwarnings('ignore:DeprecationWarning') def test_normal_ridge(): """Test ridge regression for Normal distributions From 46df5b6555938dc7c9acac67929b0633ea1354a3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 20:40:08 +0200 Subject: [PATCH 28/69] import pytest --- sklearn/linear_model/tests/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index e4be75ddb7a64..dfa205407a193 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,5 +1,6 @@ import numpy as np from numpy.testing import assert_allclose +import pytest import scipy as sp from scipy import sparse From 21f2136c083b4057868b7886880f356e58703611 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 21:17:53 +0200 Subject: [PATCH 29/69] Document arguments of abstact methods --- sklearn/linear_model/glm.py | 125 +++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index eae4a56ea1d95..825ee6bfe8c45 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -93,12 +93,22 @@ class Link(six.with_metaclass(ABCMeta)): def link(self, mu): """The link function g(mu) with argument mu=E[Y] returns the linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the predicted mean. """ raise NotImplementedError @abstractmethod def derivative(self, mu): """Derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the predicted mean. """ raise NotImplementedError @@ -106,18 +116,33 @@ def derivative(self, mu): def inverse(self, lin_pred): """The inverse link function h(lin_pred) with the linear predictor as argument returns mu=E[Y]. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative(self, lin_pred): """Derivative of the inverse link function h'(lin_pred). 
+ + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative2(self, lin_pred): """Second derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @@ -236,6 +261,11 @@ def include_upper_bound(self): def in_y_range(self, x): """Returns true if `x` is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. """ if self.include_lower_bound: if self.include_upper_bound: @@ -263,12 +293,22 @@ def unit_variance(self, mu): \partial\mu^2}}\big|_{y=\mu} See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ raise NotImplementedError() @abstractmethod def unit_variance_derivative(self, mu): r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. """ raise NotImplementedError() @@ -276,6 +316,17 @@ def variance(self, mu, phi=1, weights=1): r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return phi/weights * self.unit_variance(mu) @@ -284,6 +335,17 @@ def variance_derivative(self, mu, phi=1, weights=1): :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return phi/weights * self.unit_variance_derivative(mu) @@ -293,6 +355,14 @@ def unit_deviance(self, y, mu): In terms of the log-likelihood it is given by :math:`d(y,\mu) = -2\phi\cdot \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. """ raise NotImplementedError() @@ -301,9 +371,13 @@ def unit_deviance_derivative(self, y, mu): :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. - Returns - ------- - derivative: array, shape = (n_samples,) + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. """ return -2*(y-mu)/self.unit_variance(mu) @@ -313,6 +387,17 @@ def deviance(self, y, mu, weights=1): In terms of the likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return np.sum(weights*self.unit_deviance(y, mu)) @@ -326,6 +411,17 @@ def _deviance(self, coef, X, y, weights, link): def deviance_derivative(self, y, mu, weights=1): """The derivative w.r.t. 
`mu` of the deviance. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return weights*self.unit_deviance_derivative(y, mu) @@ -464,7 +560,16 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): return eta, mu, score, fisher def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in (unpenalized) IRLS.""" + """Starting values for the mean mu_i in (unpenalized) IRLS. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ return ((weights*y+np.mean(weights*y)) / (2.*np.sum(np.ones_like(y)*weights))) @@ -656,12 +761,12 @@ def _irls_step(X, W, P2, z): X : numpy array or sparse matrix of shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array of shape (n_samples, ) + W : numpy array of shape (n_samples,) P2 : numpy array or sparse matrix of shape (n_features, n_features) The l2-penalty matrix or vector (=diagonal matrix) - z : numpy array of shape (n_samples, ) + z : numpy array of shape (n_samples,) Working observations Returns @@ -927,12 +1032,12 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- X : numpy array or sparse matrix of shape (n_samples, n_features) - Training data + Training data. - y : numpy array of shape (n_samples, ) - Target values + y : numpy array of shape (n_samples,) + Target values. - sample_weight : array of shape (n_samples, ) or None,\ + sample_weight : array of shape (n_samples,) or None,\ optinal (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has From 1faedf87eecfd2c6a668e2aec6e28a9fff8780ec Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 21:18:51 +0200 Subject: [PATCH 30/69] Pytest filter warnings use two colons --- sklearn/linear_model/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index dfa205407a193..1ac5ccd4d3d5c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -356,7 +356,7 @@ def test_glm_log_regression(): assert_array_almost_equal(res.coef_, coef) -@pytest.mark.filterwarnings('ignore:DeprecationWarning') +@pytest.mark.filterwarnings('ignore::DeprecationWarning') def test_normal_ridge(): """Test ridge regression for Normal distributions From 992f9819838336a9372a7cadecc53eeafef439ff Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 22:07:33 +0200 Subject: [PATCH 31/69] Improve documentation of arguments that were so far undocumented --- sklearn/linear_model/glm.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 825ee6bfe8c45..a0d0266fa0efe 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -579,6 +579,17 @@ class TweedieDistribution(ExponentialDispersionModel): They have :math:`\mu=\mathrm{E}[Y]` and :math:`\mathrm{Var}[Y] \propto \mu^power. 
+ Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (0,1) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + Attributes ---------- power : float @@ -586,6 +597,12 @@ class TweedieDistribution(ExponentialDispersionModel): :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): + """ + Parameters + ---------- + power : float (default=0) + Power of (of mu) of the variance function. + """ self.power = power self._upper_bound = np.Inf self._include_upper_bound = False @@ -623,6 +640,9 @@ def __init__(self, power=0): # Positive Stable self._lower_bound = 0 self._include_lower_bound = False + else: + raise ValueError('The power must be a float, i.e. real number, ' + 'got (power={})'.format(power)) @property def power(self): @@ -653,12 +673,22 @@ def include_upper_bound(self): def unit_variance(self, mu): """The unit variance of a Tweedie distribution is v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ return np.power(mu, self.power) def unit_variance_derivative(self, mu): """The derivative of the unit variance of a Tweedie distribution is v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ return self.power*np.power(mu, self.power-1) @@ -680,9 +710,6 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, w, phi, weights=1): - raise NotImplementedError('This function is not (yet) implemented.') - class NormalDistribution(TweedieDistribution): """Class for the Normal (aka Gaussian) distribution""" From 06b8451ea109040371615e9e7baaa8ff505197f0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 00:26:39 +0200 Subject: [PATCH 32/69] Further improve documentation of arguments --- sklearn/linear_model/glm.py | 43 +++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index a0d0266fa0efe..9688b1e0c9e5c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -601,7 +601,7 @@ def __init__(self, power=0): Parameters ---------- power : float (default=0) - Power of (of mu) of the variance function. + Variance power of the `unit_variance` function. """ self.power = power self._upper_bound = np.Inf @@ -798,7 +798,7 @@ def _irls_step(X, W, P2, z): Returns ------- - coef: array, shape = (X.shape[1]) + coef: array, shape (X.shape[1]) """ # TODO: scipy.linalg.solve is faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? @@ -898,7 +898,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : None or array of shape (n_features*, ), optional\ + P1 : None or array of shape (n_features*,), optional\ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The @@ -1007,7 +1007,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- - coef_ : array, shape (n_features, ) + coef_ : array, shape (n_features,) Estimated coefficients for the linear predictor (X*coef_) in the GLM. 
intercept_ : float @@ -1679,7 +1679,7 @@ def Hs(s): if self.fit_dispersion in ['chisqr', 'deviance']: # attention because of rescaling of weights - self.dispersion_ = self.estimate_phi(y, X, weights)*weights_sum + self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum return self @@ -1688,12 +1688,12 @@ def linear_predictor(self, X): Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix, shape (n_samples, n_features) Samples. Returns ------- - C : array, shape = (n_samples) + C : array, shape (n_samples) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") @@ -1709,12 +1709,15 @@ def predict(self, X, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix, shape (n_samples, n_features) Samples. + sample_weight : array of shape (n_samples,) or None , \ + (default=None) + Returns ------- - C : array, shape = (n_samples) + C : array, shape (n_samples,) Returns predicted values times sample_weight. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], @@ -1726,9 +1729,21 @@ def predict(self, X, sample_weight=None): return mu*weights - def estimate_phi(self, y, X, sample_weight=None): - """Estimation of the dispersion parameter. + def estimate_phi(self, X, y, sample_weight=None): + """Estimation of the dispersion parameter phi. Returns the estimate. + + Parameters + ---------- + X : numpy array or sparse matrix of shape (n_samples, n_features) + Training data. + + y : numpy array, shape (n_samples,) + Target values. + + sample_weight : array of shape (n_samples,) or None,\ + optinal (default=None) + Sample weights. """ check_is_fitted(self, "coef_") _dtype = [np.float64, np.float32] @@ -1773,13 +1788,13 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape = (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Test samples - y : array-like of shape = (n_samples) + y : array-like, shape (n_samples,) True valeus for X. - sample_weight : array-like, shape = (n_samples), optional + sample_weight : array-like, shape = (n_samples,), optional Sample weights. Returns From c93f60d9e98a5be0d493d513e37c3c9de5167542 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 08:25:01 +0200 Subject: [PATCH 33/69] Remove parameters docstring for __init__ --- sklearn/linear_model/glm.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 9688b1e0c9e5c..021927b598822 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -597,12 +597,6 @@ class TweedieDistribution(ExponentialDispersionModel): :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): - """ - Parameters - ---------- - power : float (default=0) - Variance power of the `unit_variance` function. 
- """ self.power = power self._upper_bound = np.Inf self._include_upper_bound = False From 66ec63b5157026f8541e2761c29ef3225d89a44c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 19:24:23 +0200 Subject: [PATCH 34/69] Fix typos in docstring of TweedieDistribution --- sklearn/linear_model/glm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 021927b598822..65abb42b043d5 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -577,7 +577,7 @@ def starting_mu(self, y, weights=1): class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. They have :math:`\mu=\mathrm{E}[Y]` and - :math:`\mathrm{Var}[Y] \propto \mu^power. + :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -593,7 +593,7 @@ class TweedieDistribution(ExponentialDispersionModel): Attributes ---------- power : float - The variance power of the unit_variance + The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): From 53c69702c790223d3940cd650b9d52a61e39b244 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 22:20:43 +0200 Subject: [PATCH 35/69] Change docstring section of TweedieDistribution from Attributes to Parameters --- sklearn/linear_model/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 65abb42b043d5..fcb6c9754b826 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -590,7 +590,7 @@ class TweedieDistribution(ExponentialDispersionModel): 2 Gamma 3 Inverse Gaussian - Attributes + Parameters ---------- power : float The variance power of the `unit_variance` From 87d5ba38c5663224edd7ec10bc7efdf27172e7f8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 7 Oct 2018 19:43:42 +0200 Subject: [PATCH 36/69] Minor doc improvements of GeneralizedLinearRegressor --- sklearn/linear_model/glm.py | 87 +++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index fcb6c9754b826..01d5420773ffc 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,17 +6,16 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Write more tests -# TODO: Write examples and more docu -# TODO: deal with option self.copy_X +# TODO: Write examples +# TODO: Make option self.copy_X more meaningfull than just for start values. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. -# TODO: Add cross validation support +# TODO: Add cross validation support? # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used to +# TODO: Negative values in P1 are not allowed so far. They could be used # for group lasso. # Design Decisions: @@ -26,7 +25,7 @@ # regressor, Bernoulli/Binomial => classifier. # Solution: GeneralizedLinearRegressor since this is the focus. 
# - Allow for finer control of penalty terms: -# L1: ||P1*w||_1 with P1*w a componentwise product, this allows to exclude +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude # factors from the L1 penalty. # L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and @@ -322,7 +321,7 @@ def variance(self, mu, phi=1, weights=1): mu : array, shape (n_samples,) Predicted mean. - phi : float + phi : float (default=1) Dispersion parameter. weights : array, shape (n_samples,) (default=1) @@ -592,7 +591,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- - power : float + power : float (default=0) The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. """ @@ -779,22 +778,22 @@ def _irls_step(X, W, P2, z): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {numpy array, sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array of shape (n_samples,) + W : numpy array, shape (n_samples,) - P2 : numpy array or sparse matrix of shape (n_features, n_features) - The l2-penalty matrix or vector (=diagonal matrix) + P2 : {numpy array, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) - z : numpy array of shape (n_samples,) + z : numpy array, shape (n_samples,) Working observations Returns ------- coef: array, shape (X.shape[1]) """ - # TODO: scipy.linalg.solve is faster, but ordinary least squares uses + # TODO: scipy.linalg.solve seems faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? n_samples, n_features = X.shape if sparse.issparse(X): @@ -892,19 +891,20 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : None or array of shape (n_features*,), optional\ + P1 : {None, array-like}, shape (n_features*,), optional\ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as an array of ones. + default value ``None`` is the same as a 1d array of ones. Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). - P2 : None or array of shape (n_features*, n_features*), optional\ - (default=None) + P2 : {None, array-like, sparse matrix}, shape \ + (n_features*, n_features*), optional (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov regularization). + The default value ``None`` is the same as the idendity matrix. Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). P2 must be positive semi-definite. @@ -939,8 +939,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'cd' is the coordinate descent algorithm. It can deal with L1 and - L2 penalties. + - 'cd' is the coordinate descent algorithm. It can + deal with L1 as well as L2 penalties. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. 
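In line with the solver validation tested earlier in this series, only 'cd' accepts a non-zero L1 part; a small, hedged sketch of the intended behaviour (assuming the estimator from this branch, with made-up data)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[0.0], [1.0], [2.0]])
    y = np.array([0.1, 1.0, 2.1])

    # 'cd' handles the combined L1/L2 penalty
    GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0.5, solver='cd').fit(X, y)

    try:
        GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0.5, solver='lbfgs').fit(X, y)
    except ValueError as exc:
        print(exc)  # this solver cannot be combined with an L1 penalty (alpha > 0, l1_ratio > 0)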
@@ -958,8 +958,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to fit), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. - start_params : {None, 'least_squares', 'zero'} or array of shape \ - (n_features, ) or }, optional (default=None) + start_params : {None, 'least_squares', 'zero', array of shape \ + (n_features, )}, optional (default=None) If an array of size n_features is supplied, use these as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. @@ -979,7 +979,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional (default=None) + random_state : {int, RandomState instance, None}, optional (default=None) The seed of the pseudo random number generator that selects a random feature to be updated for solver 'cd' (coordinate descent). If int, random_state is the seed used by the random @@ -1052,13 +1052,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array of shape (n_samples,) + y : array-like, shape (n_samples,) Target values. - sample_weight : array of shape (n_samples,) or None,\ + sample_weight : {None, array-like}, shape (n_samples,),\ optinal (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has @@ -1190,12 +1190,12 @@ def fit(self, X, y, sample_weight=None): if self.P1 is None: P1 = np.ones(X.shape[1]) else: - P1 = np.atleast_1d(np.copy(self.P1)) + P1 = np.copy(np.atleast_1d(self.P1)) if P1.dtype.kind not in ['b', 'i', 'u', 'f']: raise ValueError("P1 must be a numeric value; " "got (dtype={0}).".format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): - raise ValueError("P1 must be either None or an 1D array with " + raise ValueError("P1 must be either None or a 1d array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." @@ -1324,6 +1324,7 @@ def fit(self, X, y, sample_weight=None): coef = _irls_step(Xnew, W, P2, z) else: # with L1 penalty, start with coef = 0 + # TODO: Are there better options? coef = np.zeros(n_features) elif isinstance(self.start_params, six.string_types): if self.start_params == 'zero': @@ -1353,7 +1354,7 @@ def fit(self, X, y, sample_weight=None): # 4. fit # ####################################################################### # algorithms for optimiation - # TODO: Parallelize it + # TODO: Parallelize it? self.n_iter_ = 0 converged = False # 4.1 IRLS ############################################################ @@ -1682,12 +1683,12 @@ def linear_predictor(self, X): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. Returns ------- - C : array, shape (n_samples) + C : array, shape (n_samples,) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") @@ -1703,17 +1704,18 @@ def predict(self, X, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. 
- sample_weight : array of shape (n_samples,) or None , \ - (default=None) + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Returns ------- C : array, shape (n_samples,) Returns predicted values times sample_weight. """ + # TODO: Is copy=True necessary? X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) @@ -1729,14 +1731,14 @@ def estimate_phi(self, X, y, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array, shape (n_samples,) + y : array-like, shape (n_samples,) Target values. - sample_weight : array of shape (n_samples,) or None,\ - optinal (default=None) + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Sample weights. """ check_is_fitted(self, "coef_") @@ -1782,13 +1784,14 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) - Test samples + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. y : array-like, shape (n_samples,) - True valeus for X. + True values of target. - sample_weight : array-like, shape = (n_samples,), optional + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Sample weights. Returns From a9ae023ec331e782d7a23de18f7e6fbb0dd1f57d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 8 Oct 2018 20:21:35 +0200 Subject: [PATCH 37/69] Double escape in doctring of GeneralizedLinearRegressor --- sklearn/linear_model/glm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 01d5420773ffc..535af60289f8e 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -866,13 +866,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO: Estimation of the dispersion parameter phi. - If your target `y` is a ratio, you should also provide appropriate weights - `w`. As an example, consider Poission distributed counts `z` (integers) and - weights `w`=exposure (time, money, persons years, ...), then you fit + If the target `y` is a ratio, appropriate weights `w` should be provided. + As an example, consider Poission distributed counts `z` (integers) and + weights `w`=exposure (time, money, persons years, ...). Then you fit `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, - sample_weight=w)``. You need the weights for the right mean, consider: - :math:`\bar(y) = \frac{\sum_i w_i y_i}{\sum_i w_i}`. - In this case one might say that y has a 'scaled' Poisson distributions. + sample_weight=w)``. The weights are necessary for the right mean, consider: + :math:`\\bar(y) = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`. + In this case one might say that 'y' has a 'scaled' Poisson distributions. The same holds for other distributions. Parameters @@ -891,7 +891,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {None, array-like}, shape (n_features*,), optional\ + P1 : {None, array-like}, shape (n_features*,), optional \ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). 
The From bb62485166412d8ba6393e5ba753b015b806867c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 31 Dec 2018 16:54:40 +0100 Subject: [PATCH 38/69] Add example for GeneralizedLinearRegressor * add example * improve docstring of GeneralizedLinearRegressor * improve user guide for GeneralizedLinearRegressor --- doc/modules/linear_model.rst | 34 +++++--- .../plot_poisson_spline_regression.py | 83 +++++++++++++++++++ sklearn/linear_model/glm.py | 22 ++--- 3 files changed, 115 insertions(+), 24 deletions(-) create mode 100644 examples/linear_model/plot_poisson_spline_regression.py diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a204ccb080cc9..d65f7ed121f8e 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -901,12 +901,15 @@ for a more versatile L2 penalty. Use cases, where a loss different from the squared loss might be appropriate, are the following: - * If the target values :math:`y` are counts (integer valued) or frequencies, you might try a Poisson deviance. + * If the target values :math:`y` are counts (non-negative integer valued) or + frequencies (non-negative), you might use a Poisson deviance with log-link. - * If the target values are positive valued and skewed, you might try a Gamma deviance. + * If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. - * If the target values seem to be heavier tailed than a Gamma distribution, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). - Keep in mind that the mean is not a good measure for very heavy tailed distributions, cf. extreme value theory. + * If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, @@ -931,6 +934,11 @@ follows: >>> reg.intercept_ #doctest: +ELLIPSIS -0.76383575... + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + Mathematical formulation ------------------------ @@ -956,20 +964,20 @@ e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. -The deviance is defined by +The deviance is defined by the log of the EDM likelihood as .. 
math:: D(y, \mu) = -2\phi\cdot \left(loglike(y,\mu,\frac{\phi}{s}) - loglike(y,y,\frac{\phi}{s})\right) -===================================== ================================= -Distribution Variance Function :math:`v(\mu)` -===================================== ================================= -Normal ("normal") :math:`1` -Poisson ("poisson") :math:`\mu` -Gamma ("gamma") :math:`\mu^2` -Inverse Gaussian ("inverse.gaussian") :math:`\mu^3` -===================================== ================================= +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Deviance :math:`D(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{/mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ Two remarks: diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py new file mode 100644 index 0000000000000..b98bca5d8f867 --- /dev/null +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -0,0 +1,83 @@ +""" +================================= +Poisson Regression with B-Splines +================================= + +As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` +example, a Poisson regression with penalized B-splines (P-splines) [1]_ is +fitted on slightly different sinusodial, Poisson distributed data and +compared to an AdaBoost model with decision trees. +One can see, that this is a hard problem for both estimators. + +.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines + and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. 
+ `doi:10.1214/ss/1038425655 + `_ + +""" +print(__doc__) + +# Author: Christian Lorentzen +# based on the AdaBoost regression example from Noel Dawe +# License: BSD 3 clause + +# importing necessary libraries +import numpy as np +from scipy.linalg import toeplitz +from scipy.interpolate import BSpline +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import AdaBoostRegressor +from sklearn.linear_model import GeneralizedLinearRegressor + + +# Create the dataset +xmin, xmax = 0, 6 +rng = np.random.RandomState(1) +X = np.linspace(xmin, xmax, 500)[:, np.newaxis] +y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) +y = rng.poisson(y_true, X.shape[0]) + +# b-spline basis +nknots, degree = 40, 3 +ns = nknots - degree - 1 # number of base spline functions +dx = (xmax - xmin) / (nknots - 1 - 2 * degree) +knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) +coef = np.zeros(ns) +splineBasis = np.empty((X.shape[0], ns), dtype=float) +for i in range(ns): + coef[i] = 1 + splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ + .ravel() + coef[i] = 0 + +# second order difference matrix +P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) +P2[0, 0] = P2[-1, -1] = 1 + +# Fit regression model +regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), + n_estimators=10, random_state=rng) + +regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', + fit_intercept=True, alpha=0.02, + l1_ratio=0.1, P2=P2) + +regr_1.fit(X, y) +regr_2.fit(splineBasis, y) + +# Predict +y_1 = regr_1.predict(X) +y_2 = regr_2.predict(splineBasis) + +# Plot the results +plt.figure() +plt.plot(X, y_true, c="b", label="true mean") +plt.scatter(X, y, c="k", marker='.', label="training samples") +plt.plot(X, y_1, c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 535af60289f8e..37afc8da2d6db 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,7 +6,7 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Write examples +# TODO: Write more examples. # TODO: Make option self.copy_X more meaningfull than just for start values. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. @@ -832,7 +832,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): with inverse link function `h` and s=sum of `sample_weight` (which equals n_samples for `sample_weight=None`). - For `P1`=`P2`=identity, the penalty is the elastic net:: + For `P1=P2=identity`, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 @@ -868,11 +868,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If the target `y` is a ratio, appropriate weights `w` should be provided. As an example, consider Poission distributed counts `z` (integers) and - weights `w`=exposure (time, money, persons years, ...). Then you fit - `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, - sample_weight=w)``. The weights are necessary for the right mean, consider: - :math:`\\bar(y) = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`. 
- In this case one might say that 'y' has a 'scaled' Poisson distributions. + weights `w=exposure` (time, money, persons years, ...). Then you fit + `y = z/w`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=w)``. The weights are necessary for the right meanself. + Consider :math:`\\bar{y} = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`, + in this case one might say that `y` has a 'scaled' Poisson distributions. The same holds for other distributions. Parameters @@ -1017,10 +1017,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): References ---------- For the coordinate descent implementation: - .. [1] Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', From 16d064db7cee1d59569d21631cc2fa41be8b3b14 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jan 2019 11:58:41 +0100 Subject: [PATCH 39/69] Resolve merge conflicts * resolve merge conflicts in linear_model.rst * replace BSpline by splev to support older scipy versions --- doc/modules/linear_model.rst | 14 +++++++------- .../linear_model/plot_poisson_spline_regression.py | 8 +++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d65f7ed121f8e..09f14735c2907 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -876,18 +876,18 @@ to warm-starting (see :term:`Glossary `). .. _Generalized_linear_regression: -Generalized linear regression +Generalized Linear Regression ============================= :class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [8]_. First, the predicted values :math:`\hat{y}` are linked to a linear +ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [9]_. The objective function beeing minimized +exponential dispersion model (EDM) [11]_. The objective function beeing minimized becomes .. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 @@ -983,7 +983,7 @@ Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [10]_. + (19)-(20) in [12]_. * If you want to model a frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing @@ -993,12 +993,12 @@ Two remarks: .. topic:: References: - .. [8] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [9] Jørgensen, B. (1992). 
The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. See also `Exponential dispersion model. `_ - .. [10] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py index b98bca5d8f867..fce85fae1ea8c 100644 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -24,7 +24,8 @@ # importing necessary libraries import numpy as np from scipy.linalg import toeplitz -from scipy.interpolate import BSpline +# from scipy.interpolate import BSpline +from scipy.interpolate import splev import matplotlib.pyplot as plt from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor @@ -47,8 +48,9 @@ splineBasis = np.empty((X.shape[0], ns), dtype=float) for i in range(ns): coef[i] = 1 - splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ - .ravel() +# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ +# .ravel() + splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() coef[i] = 0 # second order difference matrix From 1a02a901d1a6d99484241ca205178ac61fc47846 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jan 2019 12:59:30 +0100 Subject: [PATCH 40/69] Adapt for minimum numpy version * replace np.block --- sklearn/linear_model/glm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 37afc8da2d6db..851767055c61c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1235,8 +1235,11 @@ def fit(self, X, y, sample_weight=None): P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), dtype=P2.dtype).tocsr() else: - P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], - [np.zeros((X.shape[1], 1)), P2]]) + # as of numpy 1.13 this would work: + # P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], + # [np.zeros((X.shape[1], 1)), P2]]) + P2 = np.hstack((np.zeros((X.shape[1], 1)), P2)) + P2 = np.vstack((np.zeros((1, X.shape[1]+1)), P2)) else: Xnew = X From 177eb4cc017a7262e472070b4a920250711a099c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Jan 2019 19:46:19 +0100 Subject: [PATCH 41/69] Remove six dependencies as in #12639 * replace six.with_metaclass(ABCMeta) by metaclass=ABCMeta * replace six.integer_types by int * replace six.string_types by str * rebase * correct email address --- sklearn/linear_model/glm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 851767055c61c..aca49ec7edf28 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -2,12 +2,13 @@ Generalized Linear Models with Exponential Dispersion Family """ -# Author: Christian Lorentzen +# Author: Christian Lorentzen # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause # TODO: Write more examples. -# TODO: Make option self.copy_X more meaningfull than just for start values. +# TODO: Make option self.copy_X more meaningful. 
+# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -51,7 +52,6 @@ from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning -from ..externals import six from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg @@ -84,7 +84,7 @@ def _check_weights(sample_weight, n_samples): return weights -class Link(six.with_metaclass(ABCMeta)): +class Link(metaclass=ABCMeta): """Abstract base class for Link funtions """ @@ -186,7 +186,7 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) -class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): +class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -1142,7 +1142,7 @@ def fit(self, X, y, sample_weight=None): "with L1 penalties, which are included with " "(alpha={1}) and (l1_ratio={2})." .format(solver, self.alpha, self.l1_ratio)) - if (not isinstance(self.max_iter, six.integer_types) + if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " "integer;" @@ -1156,7 +1156,7 @@ def fit(self, X, y, sample_weight=None): start_params = self.start_params if start_params is None: pass - elif isinstance(start_params, six.string_types): + elif isinstance(start_params, str): if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " "'least-squares', 'zero' or an array of right" @@ -1329,7 +1329,7 @@ def fit(self, X, y, sample_weight=None): # with L1 penalty, start with coef = 0 # TODO: Are there better options? coef = np.zeros(n_features) - elif isinstance(self.start_params, six.string_types): + elif isinstance(self.start_params, str): if self.start_params == 'zero': coef = np.zeros(n_features) elif self.start_params == 'least_squares': From 3d4c784df6797c244b31dd3393083d2c63114bf2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 3 Feb 2019 21:38:24 +0100 Subject: [PATCH 42/69] Improve user guide, doc and fix penalty parameter for Ridge * move parts of docstring to new Notes section * improve user guide and doc * fix typos * fix scaling of penalty parameter in Ridge() * docstring for _check_weights * reduce tol for parameter initialization --- doc/modules/linear_model.rst | 18 +++--- sklearn/linear_model/glm.py | 114 +++++++++++++++++++---------------- 2 files changed, 73 insertions(+), 59 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09f14735c2907..174d1e4eddae4 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -890,7 +890,7 @@ Secondly, the squared loss function is replaced by the deviance :math:`D` of an exponential dispersion model (EDM) [11]_. The objective function beeing minimized becomes -.. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 +.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 +\frac{\alpha(1-\rho)}{2} w^T P_2 w with sample weights :math:`s`. 
@@ -914,7 +914,7 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This @@ -964,17 +964,19 @@ e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. -The deviance is defined by the log of the EDM likelihood as +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as -.. math:: D(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\frac{\phi}{s}) - - loglike(y,y,\frac{\phi}{s})\right) +.. math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) ===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Deviance :math:`D(y, \mu)` +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` ===================================== =============================== ================================= ============================================ Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{/mu}-y+\mu)` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` ===================================== =============================== ================================= ============================================ diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index aca49ec7edf28..5fc869f81195f 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -8,7 +8,7 @@ # TODO: Write more examples. # TODO: Make option self.copy_X more meaningful. -# So far, fit uses Xnew instead of X. +# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -28,7 +28,7 @@ # - Allow for finer control of penalty terms: # L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude # factors from the L1 penalty. -# L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be +# L2: w*P2*w with P2 a (semi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and # Tikhonov regularization). 
# - The link funtion (instance of class Link) is necessary for the evaluation @@ -59,6 +59,8 @@ def _check_weights(sample_weight, n_samples): + """Check that weights are non-negative and have the right shape + """ if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): @@ -594,6 +596,7 @@ class TweedieDistribution(ExponentialDispersionModel): power : float (default=0) The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. + For ``0`. - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\\mu_i=h(\\eta_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{w_i} v(\\mu_i)`. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - (penalized) maximum likelihood which is equivalent to minimizing the - deviance. - - For `alpha` > 0, the feature matrix `X` should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - - TODO: Estimation of the dispersion parameter phi. - - If the target `y` is a ratio, appropriate weights `w` should be provided. - As an example, consider Poission distributed counts `z` (integers) and - weights `w=exposure` (time, money, persons years, ...). Then you fit - `y = z/w`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, - sample_weight=w)``. The weights are necessary for the right meanself. - Consider :math:`\\bar{y} = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`, - in this case one might say that `y` has a 'scaled' Poisson distributions. - The same holds for other distributions. - Parameters ---------- alpha : float, optional (default=1) @@ -891,22 +871,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {None, array-like}, shape (n_features*,), optional \ + P1 : {None, array-like}, shape (n_features,), optional \ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The default value ``None`` is the same as a 1d array of ones. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). + Note that n_features = X.shape[1]. P2 : {None, array-like, sparse matrix}, shape \ - (n_features*, n_features*), optional (default=None) + (n_features, n_features), optional (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov - regularization). - The default value ``None`` is the same as the idendity matrix. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). P2 must be positive semi-definite. + regularization). The diagonal zeros of a diagonal P2, for example, + exclude all corresponding coefficients from the L2 penalty. + The default value ``None`` is the same as the identity matrix. + Note that n_features = X.shape[1]. P2 must be positive semi-definite. fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be @@ -929,18 +908,22 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ optional (default='auto') - Algorithm to use in the optimization problem. + Algorithm to use in the optimization problem: - - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. 
+ 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. - - 'irls' is iterated reweighted least squares (Fisher scoring). + 'irls' + iterated reweighted least squares (Fisher scoring). It is the standard algorithm for GLMs. Cannot deal with L1 penalties. - - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. + 'newton-cg', 'lbfgs' + Cannot deal with L1 penalties. - - 'cd' is the coordinate descent algorithm. It can - deal with L1 as well as L2 penalties. + 'cd' + coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -959,10 +942,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): starting values for ``coef_`` and ``intercept_``. start_params : {None, 'least_squares', 'zero', array of shape \ - (n_features, )}, optional (default=None) - If an array of size n_features is supplied, use these as start values + (n_features*, )}, optional (default=None) + If an array of size n_features* is supplied, use it as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept includes the + intercept in counting. If 'least_squares' is set, the result of a least squares fit in the link space (linear predictor) is taken. If 'zero' is set, all coefficients start with zero. @@ -1013,6 +998,30 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations of the solver. + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + (penalized) maximum likelihood which is equivalent to minimizing the + deviance. + + For `alpha` > 0, the feature matrix `X` should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + + If the target `y` is a ratio, appropriate sample weights `s` should be + provided. + As an example, consider Poission distributed counts `z` (integers) and + weights `s=exposure` (time, money, persons years, ...). Then you fit + `y = z/s`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=s)``. The weights are necessary for the right (finite + sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that `y` has a 'scaled' Poisson distributions. + The same holds for other distributions. References ---------- @@ -1138,10 +1147,10 @@ def fit(self, X, y, sample_weight=None): else: solver = 'cd' if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): - raise ValueError("The chosen solver (solver={0}) can't deal " - "with L1 penalties, which are included with " - "(alpha={1}) and (l1_ratio={2})." - .format(solver, self.alpha, self.l1_ratio)) + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1340,14 +1349,17 @@ def fit(self, X, y, sample_weight=None): elif self.l1_ratio <= 0.01: # ElasticNet says l1_ratio <= 0.01 is not reliable # => use Ridge + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha) + alpha=self.alpha*n_samples, + tol=np.max([self.tol, np.sqrt(self.tol)])) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio) + alpha=self.alpha, l1_ratio=self.l1_ratio, + tol=np.max([self.tol, np.sqrt(self.tol)])) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: @@ -1557,7 +1569,7 @@ def Hs(s): # minimize_z: a z + 1/2 b z^2 + c |d+z| # a = A_j # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, ee 1.3 + # c = |P1_j| = P1_j > 0, see 1.3 # d = w_j + d_j # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) # with beta = z+d, beta_hat = d-a/b and gamma = c/b From 919912c3f98e6d1190737e344c14f31c2eef9077 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 17 Feb 2019 18:38:36 +0100 Subject: [PATCH 43/69] Smarter intercept initialization and docstring improvements * smarter initialization of intercept * PEP 257 -- Docstring Conventions * minor docstring changes --- sklearn/linear_model/glm.py | 265 +++++++++++++++++++++--------------- 1 file changed, 153 insertions(+), 112 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 5fc869f81195f..d69ccd0a66486 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -59,14 +59,13 @@ def _check_weights(sample_weight, n_samples): - """Check that weights are non-negative and have the right shape - """ + """Check that weights are non-negative and have the right shape.""" if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): if sample_weight <= 0: raise ValueError("Sample weights must be non-negative.") - weights = sample_weight*np.ones(n_samples) + weights = sample_weight * np.ones(n_samples) else: _dtype = [np.float64, np.float32] weights = check_array(sample_weight, accept_sparse='csr', @@ -75,8 +74,8 @@ def _check_weights(sample_weight, n_samples): if weights.ndim > 1: raise ValueError("Sample weight must be 1D array or scalar") elif weights.shape[0] != n_samples: - raise ValueError("Sample weights must have the same length as" - " y") + raise ValueError("Sample weights must have the same length as " + "y") if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") elif not np.sum(weights) > 0: @@ -87,70 +86,72 @@ def _check_weights(sample_weight, n_samples): class Link(metaclass=ABCMeta): - """Abstract base class for Link funtions - """ + """Abstract base class for Link funtions.""" @abstractmethod def link(self, mu): - """The link function g(mu) with argument mu=E[Y] returns the - linear predictor. + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. Parameters ---------- mu : array, shape (n_samples,) - Usually the predicted mean. + Usually the (predicted) mean. """ raise NotImplementedError @abstractmethod def derivative(self, mu): - """Derivative of the link g'(mu). + """Compute the derivative of the link g'(mu). 
Parameters ---------- mu : array, shape (n_samples,) - Usually the predicted mean. + Usually the (predicted) mean. """ raise NotImplementedError @abstractmethod def inverse(self, lin_pred): - """The inverse link function h(lin_pred) with the linear predictor as - argument returns mu=E[Y]. + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linkear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative(self, lin_pred): - """Derivative of the inverse link function h'(lin_pred). + """Compute the derivative of the inverse link function h'(lin_pred). Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative2(self, lin_pred): - """Second derivative of the inverse link function h''(lin_pred). + """Compute 2nd derivative of the inverse link function h''(lin_pred). Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError class IdentityLink(Link): - """The identity link function g(x)=x. - """ + """The identity link function g(x)=x.""" def link(self, mu): return mu @@ -169,8 +170,7 @@ def inverse_derivative2(self, lin_pred): class LogLink(Link): - """The log link function g(x)=log(x). - """ + """The log link function g(x)=log(x).""" def link(self, mu): return np.log(mu) @@ -238,26 +238,22 @@ class ExponentialDispersionModel(metaclass=ABCMeta): @abstractproperty def lower_bound(self): - """The lower bound of values of Y~EDM. - """ + """The lower bound of values of Y~EDM.""" raise NotImplementedError() @abstractproperty def upper_bound(self): - """The upper bound of values of Y~EDM. - """ + """The upper bound of values of Y~EDM.""" raise NotImplementedError() @abstractproperty def include_lower_bound(self): - """If True, values of y may equal lower bound: y >= lower_bound. - """ + """If True, values of y may equal lower bound: y >= lower_bound.""" raise NotImplementedError() @abstractproperty def include_upper_bound(self): - """If True, values of y may equal upper bound: y <= upper_bound. - """ + """If True, values of y may equal upper bound: y <= upper_bound.""" raise NotImplementedError() def in_y_range(self, x): @@ -285,7 +281,9 @@ def in_y_range(self, x): @abstractmethod def unit_variance(self, mu): - r"""The unit variance :math:`v(\mu)` determines the variance as + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as a function of the mean :math:`\mu` by :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as @@ -304,7 +302,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. Parameters ---------- @@ -314,7 +314,9 @@ def unit_variance_derivative(self, mu): raise NotImplementedError() def variance(self, mu, phi=1, weights=1): - r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + r"""Compute the variance function. 
+ + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. @@ -332,7 +334,9 @@ def variance(self, mu, phi=1, weights=1): return phi/weights * self.unit_variance(mu) def variance_derivative(self, mu, phi=1, weights=1): - r"""The derivative of the variance w.r.t. `mu`, + r"""Compute the derivative of the variance w.r.t. mu. + + Returns :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. @@ -352,10 +356,12 @@ def variance_derivative(self, mu, phi=1, weights=1): @abstractmethod def unit_deviance(self, y, mu): - r"""The unit_deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is given by + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` Parameters ---------- @@ -368,7 +374,9 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - r"""The derivative w.r.t. `mu` of the unit deviance + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. @@ -383,9 +391,12 @@ def unit_deviance_derivative(self, y, mu): return -2*(y-mu)/self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""The deviance is given by :math:`D = \sum_i s_i \cdot d(y, \mu) + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the likelihood it is :math:`D = -2\phi\cdot + In terms of the log-likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. @@ -403,15 +414,15 @@ def deviance(self, y, mu, weights=1): return np.sum(weights*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weights, link): - """The deviance as a function of the coefficients `coef` - (:math:`w`). - """ + """Compute the deviance as a function of the coefficients and data.""" lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) return self.deviance(y, mu, weights) def deviance_derivative(self, y, mu, weights=1): - """The derivative w.r.t. `mu` of the deviance. + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. Parameters ---------- @@ -427,7 +438,9 @@ def deviance_derivative(self, y, mu, weights=1): return weights*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weights, link): - r"""The score function is the derivative of the + r"""Compute the score function. + + The score function is the derivative of the log-likelihood w.r.t. `coef` (:math:`w`). It is given by @@ -453,7 +466,8 @@ def _score(self, coef, phi, X, y, weights, link): return score def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""The Fisher information matrix. + r"""Compute the Fisher information matrix. 
+ The Fisher information matrix, also known as expected information matrix is given by @@ -482,9 +496,10 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): - r"""The observed information matrix. + r"""Compute the observed information matrix. + The observed information matrix, also known as the negative of - the Hessian matrix of the log-likelihood. It is given by + the Hessian matrix of the log-likelihood, is given by .. math: @@ -518,8 +533,10 @@ def _observed_information(self, coef, phi, X, y, weights, link): return observed_information def _deviance_derivative(self, coef, X, y, weights, link): - r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a - function of the coefficients `coef`. + r"""Compute the derivative of the deviance w.r.t. coef. + + The derivative of the deviance w.r.t. `coef` (:math:`w`) as a + function of the coefficients `coef` and the data. This is equivalent to :math:`-2\phi` times the score function :func:`_score` (derivative of the log-likelihood). """ @@ -528,9 +545,11 @@ def _deviance_derivative(self, coef, X, y, weights, link): return -2*score def _deviance_hessian(self, coef, X, y, weights, link): - r"""The hessian matrix w.r.t. `coef` (:math:`w`) of the deviance - as a function of the coefficients `coef`. - This is equivalent to :math:`+2\phi` times the observed information + r"""Compute the hessian matrix of the deviance w.r.t. coef. + + The hessian of the deviance w.r.t. `coef` (:math:`w`) is evaluated as + a function of the coefficients `coef` and the data. + It is equivalent to :math:`+2\phi` times the observed information matrix. """ info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, @@ -538,8 +557,12 @@ def _deviance_hessian(self, coef, X, y, weights, link): return 2*info_matrix def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): - """Calculates eta (linear predictor), mu, score function (derivative - of log-likelihood) and Fisher matrix (all with phi=1) all in one go""" + """Compute linear predictor, mean, score function and fisher matrix. + + It calculates the linear predictor, the mean, score function + (derivative of log-likelihood) and Fisher information matrix + all in one go as function of `coef` (:math:`w`) and the data. + """ n_samples, n_features = X.shape # eta = linear predictor eta = safe_sparse_dot(X, coef, dense_output=True) @@ -561,7 +584,9 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): return eta, mu, score, fisher def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in (unpenalized) IRLS. + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. Parameters ---------- @@ -577,7 +602,9 @@ def starting_mu(self, y, weights=1): class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - They have :math:`\mu=\mathrm{E}[Y]` and + + A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -668,7 +695,7 @@ def include_upper_bound(self): return self._include_upper_bound def unit_variance(self, mu): - """The unit variance of a Tweedie distribution is v(mu)=mu**power. + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. 
Parameters ---------- @@ -678,8 +705,8 @@ def unit_variance(self, mu): return np.power(mu, self.power) def unit_variance_derivative(self, mu): - """The derivative of the unit variance of a Tweedie distribution is - v(mu)=power*mu**(power-1). + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). Parameters ---------- @@ -732,9 +759,9 @@ def __init__(self): class GeneralizedHyperbolicSecand(ExponentialDispersionModel): - """A class for the von Generalized Hyperbolic Secand (GHS) distribution. + """A class for the Generalized Hyperbolic Secand (GHS) distribution. - The GHS distribution is for data y in (-inf, inf). + The GHS distribution is for tagets y in (-inf, inf). """ def __init__(self): self._lower_bound = -np.Inf @@ -770,7 +797,7 @@ def unit_deviance(self, y, mu): def _irls_step(X, W, P2, z): - """One step in iteratively reweighted least squares + """Compute one step in iteratively reweighted least squares. Solve A w = b for w with A = (X' W X + P2) @@ -823,18 +850,18 @@ def _irls_step(X, W, P2, z): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """Regression via a Generalized Linear Model (GLM) based on reproductive - Exponential Dispersion Models (EDM) with combined L1 and L2 priors as - regularizer. + """Regression via a Generalized Linear Model (GLM) with penalties. - Minimizes the objective function:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) with + combined L1 and L2 priors as regularizer minimizes the following objective + function:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w with inverse link function `h` and s=`sample_weight` (for - `sample_weight=Nones` one has s=1 and sum(s) equals `n_samples`). + `sample_weight=None`, one has s=1 and sum(s)=`n_samples`). For `P1=P2=identity`, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 @@ -893,16 +920,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ of class ExponentialDispersionModel, optional(default='normal') - the distributional assumption of the GLM, i.e. which loss function to - be minimized. + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. link : {'identity', 'log'} or an instance of class Link, optional (default='identity') - the link function of the GLM, i.e. mapping from linear predictor + The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) - method for estimation of the dispersion parameter phi. Whether to use + Method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. @@ -914,15 +941,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Sets 'irls' if l1_ratio equals 0, else 'cd'. 'irls' - iterated reweighted least squares (Fisher scoring). - It is the standard algorithm for GLMs. Cannot deal with + Iterated reweighted least squares (with Fisher scoring). + It is the standard algorithm for GLMs. It cannot deal with L1 penalties. 'newton-cg', 'lbfgs' Cannot deal with L1 penalties. 'cd' - coordinate descent algorithm. It can deal with L1 as well as L2 + Coordinate descent algorithm. It can deal with L1 as well as L2 penalties. 
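    To make the solver choices just listed concrete, here is a minimal
    sketch with made-up data; it assumes the ``GeneralizedLinearRegressor``
    from this patch series is importable from ``sklearn.linear_model``, as
    in the spline example above. With ``l1_ratio > 0`` the default
    ``solver='auto'`` resolves to ``'cd'``, the only listed solver that
    handles the L1 part; with ``l1_ratio=0`` it would resolve to ``'irls'``::

        import numpy as np
        from sklearn.linear_model import GeneralizedLinearRegressor

        # toy Poisson-distributed data (made-up)
        rng = np.random.RandomState(0)
        X = rng.uniform(size=(100, 3))
        y = rng.poisson(np.exp(X.dot(np.array([0.5, -0.2, 0.1]))))

        # elastic-net style penalty (l1_ratio > 0) => 'auto' picks 'cd'
        reg = GeneralizedLinearRegressor(family='poisson', link='log',
                                         alpha=0.1, l1_ratio=0.5,
                                         solver='auto')
        reg.fit(X, y)
        print(reg.intercept_, reg.coef_)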
max_iter : int, optional (default=100) @@ -930,15 +957,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, - the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative of - the deviance). + the objective function). warm_start : boolean, optional (default=False) - If set to ``True``, reuse the solution of the previous call to fit as - initialization for ``coef_`` and ``intercept_`` (supersedes option + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to fit), option ``start_params`` sets the + does not exit (first call to ``fit``), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. start_params : {None, 'least_squares', 'zero', array of shape \ @@ -946,8 +973,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If an array of size n_features* is supplied, use it as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept includes the - intercept in counting. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. If 'least_squares' is set, the result of a least squares fit in the link space (linear predictor) is taken. If 'zero' is set, all coefficients start with zero. @@ -960,7 +987,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): For the solver 'cd' (coordinate descent), the coordinates (features) can be updated in either cyclic or random order. If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This + rather than looping over features sequentially in the same order. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. @@ -1057,7 +1084,7 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, self.verbose = verbose def fit(self, X, y, sample_weight=None): - """Fit a generalized linear model. + """Fit a Generalized Linear Model. Parameters ---------- @@ -1087,12 +1114,14 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. 
y = y.astype(np.float64) weights = _check_weights(sample_weight, y.shape[0]) # 1.2 validate arguments of __init__ ################################## - # Garantee that self._family_instance is an instance of class + # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family @@ -1112,7 +1141,8 @@ def fit(self, X, y, sample_weight=None): " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" " got (family={0})".format(self.family)) - # Garantee that self._link_instance is set to an instance of class Link + # Guarantee that self._link_instance is set to an instance of + # class Link if isinstance(self.link, Link): self._link_instance = self.link else: @@ -1127,19 +1157,19 @@ def fit(self, X, y, sample_weight=None): .format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: - raise ValueError("Penalty term must be non-negative;" + raise ValueError("Penalty term must be a non-negative number;" " got (alpha={0})".format(self.alpha)) if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be in interval [0, 1]; got" - " (l1_ratio={0})".format(self.l1_ratio)) + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: - raise ValueError("GeneralizedLinearRegressor supports only irls, " - "auto, lbfgs, newton-cg and cd solvers, got {0}" - "".format(self.solver)) + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': if self.l1_ratio == 0: @@ -1168,8 +1198,8 @@ def fit(self, X, y, sample_weight=None): elif isinstance(start_params, str): if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " - "'least-squares', 'zero' or an array of right" - " length," + "'least-squares', 'zero' or an array of " + " correct length;" " got(start_params={0})".format(start_params)) else: start_params = check_array(start_params, accept_sparse='csr', @@ -1179,21 +1209,21 @@ def fit(self, X, y, sample_weight=None): (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" "right length and dimension; required (length" - "={0}, ndim=1), got (length={1}, ndim={2})." + "={0}, ndim=1); got (length={1}, ndim={2})." 
.format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " - "'random', got (selection={0})" + "'random'; got (selection={0})" .format(self.selection)) random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) if not isinstance(self.check_input, bool): - raise ValueError("The attribute check_input must be bool; got " + raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) if self.P1 is None: @@ -1232,7 +1262,7 @@ def fit(self, X, y, sample_weight=None): link = self._link_instance if self.fit_intercept: - # intercept is first column <=> coef[0] is for intecept + # Note: intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: @@ -1259,10 +1289,11 @@ def fit(self, X, y, sample_weight=None): P2 *= l2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric - if sparse.issparse(P2): - P2 = 0.5 * (P2 + P2.transpose()) - else: - P2 = 0.5 * (P2 + P2.T) + if P2.ndim == 2: + if sparse.issparse(P2): + P2 = 0.5 * (P2 + P2.transpose()) + else: + P2 = 0.5 * (P2 + P2.T) # 1.3 additional validations ########################################## if self.check_input: @@ -1301,7 +1332,7 @@ def fit(self, X, y, sample_weight=None): # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) - weights = weights/np.sum(weights) + weights = weights/weights_sum ####################################################################### # 3. initialization of coef = (intercept_, coef_) # @@ -1338,6 +1369,8 @@ def fit(self, X, y, sample_weight=None): # with L1 penalty, start with coef = 0 # TODO: Are there better options? coef = np.zeros(n_features) + if self.fit_intercept: + coef[0] = link.link(np.mean(y)) elif isinstance(self.start_params, str): if self.start_params == 'zero': coef = np.zeros(n_features) @@ -1546,6 +1579,8 @@ def Hs(s): # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 # Note: f'=-score and H=fisher are updated at the end of outer @@ -1694,7 +1729,7 @@ def Hs(s): return self def linear_predictor(self, X): - """The linear_predictor X*coef_ + intercept_. + """Compute the linear_predictor = X*coef_ + intercept_. Parameters ---------- @@ -1741,8 +1776,7 @@ def predict(self, X, sample_weight=None): return mu*weights def estimate_phi(self, X, y, sample_weight=None): - """Estimation of the dispersion parameter phi. - Returns the estimate. + """Estimate/fit the dispersion parameter phi. Parameters ---------- @@ -1755,6 +1789,11 @@ def estimate_phi(self, X, y, sample_weight=None): sample_weight : {None, array-like}, shape (n_samples,), optional \ (default=None) Sample weights. + + Returns + ------- + phi : float + Dispersion parameter. 
""" check_is_fitted(self, "coef_") _dtype = [np.float64, np.float32] @@ -1785,15 +1824,17 @@ def estimate_phi(self, X, y, sample_weight=None): # "AssertionError: -0.28014056555724598 not greater than 0.5" # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): - r"""Returns D^2, a generalization of the coefficient of determination - R^2, which uses deviance instead of squared error. + r"""Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. D^2 is defined as :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is averaged by sample_weight. In the case of a Normal - distribution, D^2 equals R^2. + :math:`\bar{y}` is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -1812,7 +1853,7 @@ def score(self, X, y, sample_weight=None): Returns ------- score : float - D^2 of self.predict(X) wrt. y. + D^2 of self.predict(X) w.r.t. y. """ # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get From 01033e36d913756f7ff5e2214189cf1d7426dee1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 20 Feb 2019 17:20:09 +0100 Subject: [PATCH 44/69] Fix false formula in starting_mu and improve start_params --- sklearn/linear_model/glm.py | 108 ++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index d69ccd0a66486..fad7492acc2fb 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -583,7 +583,7 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): fisher = safe_sparse_dot(X.T, temp, dense_output=False) return eta, mu, score, fisher - def starting_mu(self, y, weights=1): + def starting_mu(self, y, weights=1, ind_weight=0.5): """Set starting values for the mean mu. These may be good starting points for the (unpenalized) IRLS solver. @@ -595,9 +595,13 @@ def starting_mu(self, y, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. """ - return ((weights*y+np.mean(weights*y)) / - (2.*np.sum(np.ones_like(y)*weights))) + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) class TweedieDistribution(ExponentialDispersionModel): @@ -852,17 +856,19 @@ def _irls_step(X, W, P2, z): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. - GLMs based on a reproductive Exponential Dispersion Model (EDM) with - combined L1 and L2 priors as regularizer minimizes the following objective - function:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean `mu=h(X*w)`. 
Therefore the fit minimizes + the following objective function with combined L1 and L2 priors as + regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w with inverse link function `h` and s=`sample_weight` (for - `sample_weight=None`, one has s=1 and sum(s)=`n_samples`). - For `P1=P2=identity`, the penalty is the elastic net:: + ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). + For `P1=P2=identity` (``P1=None``, ``P2=None``), the penalty is the + elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 @@ -966,24 +972,34 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): as initialization for ``coef_`` and ``intercept_`` (supersedes option ``start_params``). If set to ``True`` or if the attribute ``coef_`` does not exit (first call to ``fit``), option ``start_params`` sets the - starting values for ``coef_`` and ``intercept_``. + start values for ``coef_`` and ``intercept_``. + + start_params : {'irls', 'least_squares', 'zero', array of shape \ + (n_features*, )}, optional (default='irls') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'irls' + Start values of mu are calculated by family.starting_mu(..). Then, + one step of irls obtains start values for ``coef_`. This gives + usually good results. - start_params : {None, 'least_squares', 'zero', array of shape \ - (n_features*, )}, optional (default=None) - If an array of size n_features* is supplied, use it as start values - for ``coef_`` in the fit. If ``fit_intercept=True``, the first element + 'least_squares' + Start values for ``coef_`` are obtained by a least squares fit in the + link space (y is transformed to the space of the linear predictor). + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes the intercept in counting. - If 'least_squares' is set, the result of a least squares fit in the - link space (linear predictor) is taken. - If 'zero' is set, all coefficients start with zero. - If ``None``, the start values are calculated by setting mu to - family.starting_mu(..) and one step of irls. - These options only apply if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - selection : str, optional (default='random') + selection : str, optional (default='cyclic') For the solver 'cd' (coordinate descent), the coordinates (features) can be updated in either cyclic or random order. If set to 'random', a random coefficient is updated every iteration @@ -1005,7 +1021,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): check_input : boolean, optional (default=True) Allow to bypass several checks on input: y values in range of family, - sample_weights non-negative, P2 positive semi-definite. + sample_weight non-negative, P2 positive semi-definite. Don't use this parameter unless you know what you do. 
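    To make the roles of ``P1`` and ``P2`` in the objective function at the
    top of this docstring concrete, the penalty term alone can be written
    down directly. A small NumPy sketch with made-up numbers, where ``w``
    stands for the coefficients without the intercept (the intercept is
    never penalized)::

        import numpy as np

        w = np.array([0.5, -1.0, 2.0])      # made-up coefficients
        alpha, l1_ratio = 1.0, 0.5
        P1 = np.array([1.0, 0.0, 1.0])      # 0 excludes 2nd coef from L1
        P2 = np.array([[2.0, -1.0, 0.0],    # made-up positive definite
                       [-1.0, 2.0, -1.0],   # difference-type (Tikhonov)
                       [0.0, -1.0, 2.0]])   # penalty matrix

        # alpha * l1_ratio * ||P1*w||_1
        l1 = alpha * l1_ratio * np.sum(np.abs(P1 * w))
        # 1/2 * alpha * (1 - l1_ratio) * w*P2*w
        l2 = 0.5 * alpha * (1 - l1_ratio) * w.dot(P2).dot(w)
        print(l1 + l2)  # 1.25 + 3.875 = 5.125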
verbose : int, optional (default=0) @@ -1061,8 +1077,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params=None, - selection='random', random_state=None, copy_X=True, + tol=1e-4, warm_start=False, start_params='irls', + selection='cyclic', random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio @@ -1193,11 +1209,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is None: - pass - elif isinstance(start_params, str): - if start_params not in ['least_squares', 'zero']: - raise ValueError("The argument start_params must be None, " + if isinstance(start_params, str): + if start_params not in ['irls', 'least_squares', 'zero']: + raise ValueError("The argument start_params must be 'irls', " "'least-squares', 'zero' or an array of " " correct length;" " got(start_params={0})".format(start_params)) @@ -1348,11 +1362,11 @@ def fit(self, X, y, sample_weight=None): self.coef_)) else: coef = self.coef_ - elif self.start_params is None: - if self.l1_ratio == 0: + elif isinstance(start_params, str): + if start_params == 'irls': # See 3.1 IRLS # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weights) + mu = family.starting_mu(y, weights=weights) # linear predictor eta = link.link(mu) # h'(eta) @@ -1365,16 +1379,9 @@ def fit(self, X, y, sample_weight=None): # solve A*coef = b # A = X' W X + l2 P2, b = X' W z coef = _irls_step(Xnew, W, P2, z) - else: - # with L1 penalty, start with coef = 0 - # TODO: Are there better options? - coef = np.zeros(n_features) - if self.fit_intercept: - coef[0] = link.link(np.mean(y)) - elif isinstance(self.start_params, str): - if self.start_params == 'zero': - coef = np.zeros(n_features) - elif self.start_params == 'least_squares': + elif start_params == 'least_squares': + # less restrictive tolerance for finding start values + tol = np.max([self.tol, np.sqrt(self.tol)]) if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1384,18 +1391,21 @@ def fit(self, X, y, sample_weight=None): # => use Ridge # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha*n_samples, - tol=np.max([self.tol, np.sqrt(self.tol)])) + alpha=self.alpha*n_samples, tol=tol) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? 
reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio, - tol=np.max([self.tol, np.sqrt(self.tol)])) + tol=tol) reg.fit(Xnew, link.link(y)) coef = reg.coef_ - else: + else: # start_params == 'zero' + coef = np.zeros(n_features) + if self.fit_intercept: + coef[0] = link.link(np.average(y, weights=weights)) + else: # assign given array as start values coef = start_params ####################################################################### @@ -1560,6 +1570,8 @@ def Hs(s): # some precalculations eta, mu, score, fisher = family._eta_mu_score_fisher( coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) # initial stopping tolerance of inner loop # use L1-norm of minimum-norm of subgradient of F # fp_wP2 = f'(w) + w*P2 @@ -1574,8 +1586,8 @@ def Hs(s): # outer loop while self.n_iter_ < self.max_iter: self.n_iter_ += 1 - # initialize search direction d (to be optimized) - d = np.zeros_like(coef) + # initialize search direction d (to be optimized) with zero + d.fill(0) # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper From 4071a8a54de0112fd1afd6d8fc5b5585708c84ea Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 20 Feb 2019 21:15:04 +0100 Subject: [PATCH 45/69] Improve argument handling of P1 and P2 * P2 also accepts 1d array and interprets it as diagonal matrix * improved input checks for P1 and P2 --- sklearn/linear_model/glm.py | 103 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 31 ++++---- 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index fad7492acc2fb..33e0d75730e3a 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -45,7 +45,10 @@ from abc import ABCMeta, abstractmethod, abstractproperty import numbers import numpy as np -from scipy import linalg, optimize, sparse, special +from scipy import linalg, sparse +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +from scipy.special import xlogy import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -727,7 +730,7 @@ def unit_deviance(self, y, mu): if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (special.xlogy(y, y/mu) - y + mu) + return 2 * (xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -840,7 +843,7 @@ def _irls_step(X, W, P2, z): XtW = X.transpose() * W A = XtW * X + L2 b = XtW * z - coef = sparse.linalg.spsolve(A, b) + coef = splinalg.spsolve(A, b) else: XtW = (X.T * W) A = XtW.dot(X) @@ -867,7 +870,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): with inverse link function `h` and s=`sample_weight` (for ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). - For `P1=P2=identity` (``P1=None``, ``P2=None``), the penalty is the + For ``P1=P2='identity'`` (``P1=None``, ``P2=None``), the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 @@ -904,21 +907,24 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. 
- P1 : {None, array-like}, shape (n_features,), optional \ - (default=None) + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as a 1d array of ones. + default value ``'identity'`` is the same as a 1d array of ones. Note that n_features = X.shape[1]. - P2 : {None, array-like, sparse matrix}, shape \ - (n_features, n_features), optional (default=None) - With this square matrix the L2 penalty is calculated as `w P2 w`. - This gives a fine control over this penalty (Tikhonov - regularization). The diagonal zeros of a diagonal P2, for example, - exclude all corresponding coefficients from the L2 penalty. - The default value ``None`` is the same as the identity matrix. - Note that n_features = X.shape[1]. P2 must be positive semi-definite. + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be @@ -1074,7 +1080,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='identity', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', @@ -1240,20 +1246,23 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) - if self.P1 is None: + if isinstance(self.P1, str) and self.P1 == 'identity': P1 = np.ones(X.shape[1]) else: - P1 = np.copy(np.atleast_1d(self.P1)) - if P1.dtype.kind not in ['b', 'i', 'u', 'f']: - raise ValueError("P1 must be a numeric value; " - "got (dtype={0}).".format(P1.dtype)) + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=True) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): - raise ValueError("P1 must be either None or a 1d array with " - "the length of X.shape[1]; " + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." 
.format(P1.shape[0], X.shape[1])) - if self.P2 is None: + if isinstance(self.P2, str) and self.P2 == 'identity': if not sparse.issparse(X): P2 = np.ones(X.shape[1]) else: @@ -1262,8 +1271,15 @@ def fit(self, X, y, sample_weight=None): else: P2 = check_array(self.P2, copy=True, accept_sparse=['csr', 'csc', 'coo'], - dtype="numeric", ensure_2d=True) - if ((P2.ndim != 2) or + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + if P2.shape[0] != X.shape[1]: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + elif ((P2.ndim != 2) or (P2.shape[0] != P2.shape[1]) or (P2.shape[0] != X.shape[1])): raise ValueError("P2 must be either None or an array of shape " @@ -1319,21 +1335,32 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. - if self.P1 is not None: + if self.P1 != 'identity': if not np.all(P1 >= 0): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite - if self.P2 is not None: - if sparse.issparse(P2): - # TODO: check sparse P2 for non-negativeness - # raise NotImplementedError("Check sparse P2 for " - # "non-negativeness is not yet " - # "implemented.") - pass - elif P2.ndim == 2: - if not np.all(np.linalg.eigvals(P2) >= -1e-15): - raise ValueError("P2 must be positive definite.") + if self.P2 != 'identity': + # due to numerical precision, we allow eigenvalues to be a + # tiny bit negative + epsneg = 10 * np.finfo(P2.dtype).epsneg + if P2.ndim == 1 or P2.shape[0] == 1: + if not np.all(P2 >= 0): + raise ValueError("1d array P2 must not have negative " + "values.") + elif sparse.issparse(P2): + # for sparse matrices, not all eigenvals can be computed + # efficiently, use only half of n_features + # k = how many eigenvals to compute + k = np.min([10, n_features // 10 + 1]) + sigma = 0 # start searching near this value + which = 'SA' # find smallest algebraic eigenvalues first + if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, + which=which) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + else: + if not np.all(linalg.eigvalsh(P2) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? 
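The dense branch of this new check reduces to testing that the smallest eigenvalue of P2 is not more than a tiny tolerance below zero. A standalone toy sketch of that idea (plain numpy/scipy, not part of the patch; note that the tolerance itself has to be a small negative number)::

    import numpy as np
    from scipy import linalg

    P2 = np.array([[2., -1.],
                   [-1., 2.]])  # hypothetical symmetric penalty matrix
    # allow eigenvalues to be a tiny bit negative due to rounding errors
    epsneg = -10 * np.finfo(P2.dtype).epsneg
    if not np.all(linalg.eigvalsh(P2) >= epsneg):
        raise ValueError("P2 must be positive semi-definite.")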
@@ -1520,7 +1547,7 @@ def Hs(s): args = (Xnew, y, weights, link) if solver == 'lbfgs': - coef, loss, info = optimize.fmin_l_bfgs_b( + coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, maxiter=self.max_iter) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1ac5ccd4d3d5c..fde1604ad16e3 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -188,25 +188,26 @@ def test_glm_l1_ratio_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_P1_argument(): - """Test GLM P1 arguments - """ +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3]]) +def test_glm_P1_argument(P1): + """Test GLM P1 arguments.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for P1 in [['a string', 'a string'], [1, [2]], [1, 2, 3]]: - glm = GeneralizedLinearRegressor(P1=P1) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(P1=P1) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) -# def test_glm_P2_argument(): -# """Test GLM P2 arguments -# """ -# y = np.array([1, 2]) -# X = np.array([[1], [1]]) -# for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Wed, 20 Feb 2019 22:48:14 +0100 Subject: [PATCH 46/69] Fix doctest, test_poisson_enet, change IRLS to use lstsq, fix input checks * adapt examples of GeneralizedLinearModel to new defaults for P1, P2 and selection * fix precision/decimal issue in test_poisson_enet * use more robust least squares instead of solve in IRLS * fix sign error in input checks --- doc/modules/linear_model.rst | 16 +++++++++------- sklearn/linear_model/glm.py | 25 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 24 ++++++++++++++++++++---- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 174d1e4eddae4..e60e9e84a4747 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -924,15 +924,17 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE - GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion=None, - fit_intercept=True, l1_ratio=0, link='log', max_iter=100, - random_state=None, selection='random', solver='auto', - start_params=None, tol=0.0001, verbose=0, warm_start=False) + GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, + check_input=True, copy_X=True, family='poisson', + fit_dispersion=None, fit_intercept=True, l1_ratio=0, + link='log', max_iter=100, random_state=None, + selection='cyclic', solver='auto', + start_params='irls', tol=0.0001, verbose=0, + warm_start=False) >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE - array([0.24630255, 0.43373521]) + array([0.24630169, 0.43373464]) >>> reg.intercept_ #doctest: +ELLIPSIS - -0.76383575... + -0.76383633... .. topic:: Examples: diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 33e0d75730e3a..bc1a0434fa3b0 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -830,8 +830,9 @@ def _irls_step(X, W, P2, z): ------- coef: array, shape (X.shape[1]) """ - # TODO: scipy.linalg.solve seems faster, but ordinary least squares uses - # scipy.linalg.lstsq. What is more appropriate? 
+ # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. n_samples, n_features = X.shape if sparse.issparse(X): W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() @@ -843,7 +844,8 @@ def _irls_step(X, W, P2, z): XtW = X.transpose() * W A = XtW * X + L2 b = XtW * z - coef = splinalg.spsolve(A, b) + # coef = splinalg.spsolve(A, b) + coef, *_ = splinalg.lsmr(A, b) else: XtW = (X.T * W) A = XtW.dot(X) @@ -852,7 +854,8 @@ def _irls_step(X, W, P2, z): else: A += P2 b = XtW.dot(z) - coef = linalg.solve(A, b) + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) return coef @@ -1340,12 +1343,15 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite - if self.P2 != 'identity': + if not isinstance(self.P2, str): # self.P2 != 'identity' # due to numerical precision, we allow eigenvalues to be a # tiny bit negative - epsneg = 10 * np.finfo(P2.dtype).epsneg + epsneg = -10 * np.finfo(P2.dtype).epsneg if P2.ndim == 1 or P2.shape[0] == 1: - if not np.all(P2 >= 0): + p2 = P2 + if sparse.issparse(P2): + p2 = P2.toarray() + if not np.all(p2 >= 0): raise ValueError("1d array P2 must not have negative " "values.") elif sparse.issparse(P2): @@ -1360,6 +1366,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): + return P2 raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? @@ -1689,7 +1696,7 @@ def Hs(s): mn_subgrad = (np.where(coef + d == 0, np.sign(A)*np.maximum(np.abs(A)-P1, 0), A+np.sign(coef+d)*P1)) - mn_subgrad = np.sum(np.abs(mn_subgrad)) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= inner_tol: if inner_iter == 1: inner_tol = inner_tol/4. 
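The stopping criteria of the coordinate descent solver use the minimum-norm subgradient of the objective F(w) = f(w) + ||P1*w||_1, i.e. per coordinate the element of the subdifferential with the smallest absolute value. A toy numpy sketch of that elementwise formula with hypothetical values (not part of the patch)::

    import numpy as np

    fp = np.array([0.3, -0.2, 1.5])   # gradient of the smooth part f'(w)
    w = np.array([0.0, 0.0, -2.0])    # current coefficients
    P1 = np.ones_like(w)              # L1 penalty weights

    # where w_j == 0 the L1 term contributes the interval [-P1_j, P1_j],
    # so the gradient is shrunk towards zero by at most P1_j;
    # elsewhere the L1 term is differentiable with slope sign(w_j) * P1_j
    mn_subgrad = np.where(w == 0,
                          np.sign(fp) * np.maximum(np.abs(fp) - P1, 0),
                          fp + np.sign(w) * P1)
    np.abs(mn_subgrad).sum()          # L1 norm used for convergence: 0.5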
@@ -1740,7 +1747,7 @@ def Hs(s): mn_subgrad = (np.where(coef == 0, np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), fp_wP2+np.sign(coef)*P1)) - mn_subgrad = np.sum(np.abs(mn_subgrad)) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= self.tol: converged = True break diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index fde1604ad16e3..8893028d0176a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose import pytest import scipy as sp -from scipy import sparse +from scipy import sparse, optimize from sklearn.linear_model.glm import ( Link, @@ -199,8 +199,7 @@ def test_glm_P1_argument(P1): @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3]), - sparse.lil_matrix([[1]])]) + sparse.csr_matrix([1, 2, 3])]) def test_glm_P2_argument(P2): """Test GLM P2 arguments.""" y = np.array([1, 2]) @@ -515,12 +514,29 @@ def test_poisson_enet(): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-7, + link='log', solver='cd', tol=1e-8, selection='random', random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:]+coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_almost_equal(glm.intercept_, res.x[0], decimal=5) + assert_almost_equal(glm.coef_, res.x[1:], decimal=5) + assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, decimal=8) + # same for start_params='zero' and selection='cyclic' # with reduced precision glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', From ed8e74f97d2b1921af4b8c2907c9e30629788bdc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 23 Feb 2019 14:13:22 +0100 Subject: [PATCH 47/69] Use pytest decorators and pytest.raises --- sklearn/linear_model/glm.py | 7 +- sklearn/linear_model/tests/test_glm.py | 510 ++++++++++++------------- 2 files changed, 257 insertions(+), 260 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index bc1a0434fa3b0..f583e17433ee3 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -765,8 +765,8 @@ def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) -class GeneralizedHyperbolicSecand(ExponentialDispersionModel): - """A class for the Generalized Hyperbolic Secand (GHS) distribution. +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. The GHS distribution is for tagets y in (-inf, inf). """ @@ -1338,7 +1338,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. 
- if self.P1 != 'identity': + if not isinstance(self.P1, str): # if self.P1 != 'identity': if not np.all(P1 >= 0): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite @@ -1366,7 +1366,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): - return P2 raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 8893028d0176a..361a237f2cc9f 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose import pytest import scipy as sp -from scipy import sparse, optimize +from scipy import linalg, optimize, sparse from sklearn.linear_model.glm import ( Link, @@ -11,354 +11,355 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecand, + GeneralizedHyperbolicSecant, GeneralizedLinearRegressor) from sklearn.linear_model import ElasticNet, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, - assert_array_equal, assert_array_almost_equal, - assert_raises) + assert_array_equal, assert_array_almost_equal) -def test_link_properties(): - """Test link inverse and derivative - """ +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" rng = np.random.RandomState(0) x = rng.rand(100)*100 - # from sklearn.linear_model.glm import Link - # for link in vars()['Link'].__subclasses__(): - for link in Link.__subclasses__(): - link = link() - assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) - assert_almost_equal(link.inverse_derivative(link.link(x)), - 1/link.derivative(x), decimal=10) - - -def test_family_bounds(): - """Test the valid range of distributions - """ - family = NormalDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [True, True, True]) - - family = PoissonDistribution() + link = link() # instatiate object + assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) + assert_almost_equal(link.inverse_derivative(link.link(x)), + 1/link.derivative(x), decimal=10) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, True, True]) - - family = TweedieDistribution(power=1.5) - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, True, True]) - - family = GammaDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - family = InverseGaussianDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - family = TweedieDistribution(power=4.5) - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - -def test_deviance_zero(): 
- """Test deviance(y,y) = 0 for different families - """ - for family in [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=-2.5), - TweedieDistribution(power=-1), - TweedieDistribution(power=1.5), - TweedieDistribution(power=2.5), - TweedieDistribution(power=4), - GeneralizedHyperbolicSecand()]: - assert_almost_equal(family.deviance(0.1, 0.1), 0, decimal=10) - assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) - - -def test_fisher_matrix(): + assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_almost_equal(family.deviance(x, x), 0, decimal=10) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())]) +def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. Trick: Use numerical differentiation with y = mu""" - for family in [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution()]: - link = LogLink() - rng = np.random.RandomState(0) - coef = np.array([-2, 1, 0, 1, 2.5]) - phi = 0.5 - X = rng.randn(10, 5) - lin_pred = np.dot(X, coef) - mu = link.inverse(lin_pred) - weights = rng.randn(10)**2 + 1 - fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - approx = np.array([]).reshape(0, coef.shape[0]) - for i in range(coef.shape[0]): - def f(coef): - return -family._score(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link)[i] - approx = np.vstack( - [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)] - ) - assert_allclose(fisher, approx, rtol=1e-3) + rng = np.random.RandomState(0) + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) def test_sample_weights_validation(): - """Test the raised errors in the validation of sample_weight""" + """Test the raised errors in the validation of sample_weight.""" # 1. scalar value but not positive X = [[1]] y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 2. 
2d array weights = [[0]] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 3. 1d but wrong length weights = [1, 0] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 4. 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 5. 1d but weith a negative value weights = [2, -1] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) def test_glm_family_argument(): - """Test GLM family argument set as string - """ + """Test GLM family argument set as string.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) + X = np.array([[1], [2]]) for (f, fam) in [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse.gaussian', InverseGaussianDistribution())]: - glm = GeneralizedLinearRegressor(family=f, fit_intercept=False, - alpha=0).fit(X, y) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) assert_equal(type(glm._family_instance), type(fam)) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) - assert_raises(ValueError, glm.fit, X, y) + with pytest.raises(ValueError): + glm.fit(X, y) def test_glm_link_argument(): - """Test GLM link argument set as string - """ + """Test GLM link argument set as string.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) + X = np.array([[1], [2]]) for (l, link) in [('identity', IdentityLink()), ('log', LogLink())]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - link=l).fit(X, y) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) assert_equal(type(glm._link_instance), type(link)) - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - link='not a link') - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_alpha_argument(): - """Test GLM alpha argument - """ +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for alpha in ['not a number', -4.2]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - alpha=alpha) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_l1_ratio_argument(): - """Test GLM l1_ratio argument - """ +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for l1_ratio in ['not a number', -4.2, 1.1, [1]]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - l1_ratio=l1_ratio) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) -@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3]]) +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) def test_glm_P1_argument(P1): - """Test GLM P1 
arguments.""" + """Test GLM for invalid P1 argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(P1=P1) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) with pytest.raises((ValueError, TypeError)): glm.fit(X, y) @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3])]) + sparse.csr_matrix([1, 2, 3]), [-1]]) def test_glm_P2_argument(P2): - """Test GLM P2 arguments.""" + """Test GLM for invalid P2 argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False) - with pytest.raises((ValueError, TypeError)): + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): glm.fit(X, y) -def test_glm_fit_intercept_argument(): - """Test GLM fit_intercept argument - """ - y = np.array([1, 2]) - X = np.array([[1], [1]]) - for fit_intercept in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) - assert_raises(ValueError, glm.fit, X, y) +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + rng = np.random.RandomState(42) + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_solver_argument(): - """Test GLM solver argument - """ +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for solver in ['not a solver', 1, [1]]: - glm = GeneralizedLinearRegressor(solver=solver) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError): + glm.fit(X, y) - # solver not suitable for L1 penalty - for solver in ['irls', 'lbfgs', 'newton-cg']: - glm = GeneralizedLinearRegressor(solver=solver, alpha=1, l1_ratio=0.1) - assert_raises(ValueError, glm.fit, X, y) +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_max_iter_argument(): - """Test GLM max_iter argument - """ + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for max_iter in ['not a number', 0, -1, 5.5, [1]]: - glm = GeneralizedLinearRegressor(max_iter=max_iter) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_tol_argument(): - """Test GLM tol argument - """ +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for 
invalid tol argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for tol in ['not a number', 0, -1.0, [1e-3]]: - glm = GeneralizedLinearRegressor(tol=tol) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_warm_start_argument(): - """Test GLM warm_start argument - """ +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for warm_start in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(warm_start=warm_start) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_start_params_argument(): - """Test GLM start_params argument - """ +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for start_params in ['not a start_params', ['zero'], [0, 0, 0], - [[0, 0]], ['a', 'b']]: - glm = GeneralizedLinearRegressor(start_params=start_params) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_selection_argument(): - """Test GLM selection argument - """ +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for selection in ['not a selection', 1, 0, ['cyclic']]: - glm = GeneralizedLinearRegressor(selection=selection) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_random_state_argument(): - """Test GLM random_state argument - """ +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for random_state in ['a string', 0.5, [0]]: - glm = GeneralizedLinearRegressor(random_state=random_state) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_copy_X_argument(): - """Test GLM copy_X arguments - """ +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X arguments.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for copy_X in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(copy_X=copy_X) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(copy_X=copy_X) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_check_input_argument(): - """Test GLM check_input argument - """ +@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) +def test_glm_check_input_argument(check_input): + """Test GLM for invalid check_input argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for check_input in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(check_input=check_input) - 
assert_raises(ValueError, glm.fit, X, y) - + glm = GeneralizedLinearRegressor(check_input=check_input) + with pytest.raises(ValueError): + glm.fit(X, y) -# TODO: check additional validations if check_input == True -def test_glm_identiy_regression(): - """Test GLM regression with identity link on a simple dataset - """ +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_identiy_regression(family, solver): + """Test GLM regression with identity link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - families = ( - NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - for family in families: - glm = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=False, solver=solver) - res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) - - -def test_glm_log_regression(): - """Test GLM regression with log link on a simple dataset - """ + glm = GeneralizedLinearRegressor(alpha=0, family=family, + fit_intercept=False, solver=solver) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_log_regression(family, solver): + """Test GLM regression with log link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) - families = ( - NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg']: - for family in families: - glm = GeneralizedLinearRegressor( + glm = GeneralizedLinearRegressor( alpha=0, family=family, link=LogLink(), fit_intercept=False, solver=solver, start_params='least_squares') - res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) @pytest.mark.filterwarnings('ignore::DeprecationWarning') -def test_normal_ridge(): - """Test ridge regression for Normal distributions +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_normal_ridge(solver): + """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. 
""" @@ -375,25 +376,23 @@ def test_normal_ridge(): ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, - family='normal', link='identity', - fit_intercept=True, tol=1e-6, - max_iter=100, solver=solver, - random_state=42) - glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=1e-6, max_iter=100, solver=solver, + random_state=42) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, - family='normal', link='identity', - fit_intercept=False, solver='irls', - fit_dispersion='chisqr') + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=1e-6, max_iter=100, solver=solver, + random_state=42, fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -413,23 +412,22 @@ def test_normal_ridge(): ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, - family='normal', link='identity', - fit_intercept=True, solver=solver, - max_iter=300, random_state=42) - glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, + family='normal', link='identity', + fit_intercept=True, solver=solver, + max_iter=300, random_state=42) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', - fit_intercept=False, solver='irls') + fit_intercept=False, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -438,7 +436,7 @@ def test_normal_ridge(): def test_poisson_ridge(): - """Test ridge regression with poisson family and LogLink + """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" # library("glmnet") @@ -470,7 +468,7 @@ def test_poisson_ridge(): def test_normal_enet(): - """Tet elastic net regression with normal/gaussian family""" + """Test elastic net regression with normal/gaussian family.""" rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 n_samples, n_features = 20, 2 @@ -495,7 +493,7 @@ def test_normal_enet(): def test_poisson_enet(): - """Test elastic net regression with poisson family and LogLink + """Test elastic net regression with poisson family and LogLink. Compare to R's glmnet""" # library("glmnet") From fe876da908a7d5aefe8fa9ac56f4c5130ccf83df Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 24 Feb 2019 12:45:55 +0100 Subject: [PATCH 48/69] Add Logistic regression=Binomial + Logit * add Binomial distribution * add Logit link * tests for binomial against LogisticRegression * option 'auto' for link * reduce code duplication by replacing @abstractproperty by @property --- sklearn/linear_model/glm.py | 160 +++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 57 +++++++-- 2 files changed, 149 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index f583e17433ee3..01e40b322946c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -42,13 +42,12 @@ from __future__ import division -from abc import ABCMeta, abstractmethod, abstractproperty +from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import linalg, sparse +from scipy import linalg, sparse, special import scipy.sparse.linalg as splinalg from scipy.optimize import fmin_l_bfgs_b -from scipy.special import xlogy import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -191,6 +190,28 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. - 2 * ep) + + class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). @@ -238,26 +259,25 @@ class ExponentialDispersionModel(metaclass=ABCMeta): https://en.wikipedia.org/wiki/Exponential_dispersion_model. 
""" - - @abstractproperty + @property def lower_bound(self): - """The lower bound of values of Y~EDM.""" - raise NotImplementedError() + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound - @abstractproperty + @property def upper_bound(self): - """The upper bound of values of Y~EDM.""" - raise NotImplementedError() + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound - @abstractproperty + @property def include_lower_bound(self): - """If True, values of y may equal lower bound: y >= lower_bound.""" - raise NotImplementedError() + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound - @abstractproperty + @property def include_upper_bound(self): - """If True, values of y may equal upper bound: y <= upper_bound.""" - raise NotImplementedError() + """Get True if upper bound for y is includede: y <= upper_bound.""" + return self._include_upper_bound def in_y_range(self, x): """Returns true if `x` is in the valid range of Y~EDM. @@ -685,22 +705,6 @@ def power(self, power): .format(power)) self._power = power - @property - def lower_bound(self): - return self._lower_bound - - @property - def upper_bound(self): - return self._upper_bound - - @property - def include_lower_bound(self): - return self._include_lower_bound - - @property - def include_upper_bound(self): - return self._include_upper_bound - def unit_variance(self, mu): """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. @@ -730,7 +734,7 @@ def unit_deviance(self, y, mu): if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (xlogy(y, y/mu) - y + mu) + return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -776,22 +780,6 @@ def __init__(self): self._include_lower_bound = False self._include_upper_bound = False - @property - def lower_bound(self): - return self._lower_bound - - @property - def upper_bound(self): - return self._upper_bound - - @property - def include_lower_bound(self): - return self._include_lower_bound - - @property - def include_upper_bound(self): - return self._include_upper_bound - def unit_variance(self, mu): return 1 + mu**2 @@ -803,6 +791,27 @@ def unit_deviance(self, y, mu): np.log((1+mu**2)/(1+y**2))) +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution is for tagets y in [0, 1]. + """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2*(special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + def _irls_step(X, W, P2, z): """Compute one step in iteratively reweighted least squares. @@ -933,15 +942,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ - of class ExponentialDispersionModel, optional(default='normal') + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') The distributional assumption of the GLM, i.e. 
which distribution from the EDM, specifies the loss function to be minimized. - link : {'identity', 'log'} or an instance of class Link, - optional (default='identity') + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, + optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor - (X*coef) to expectation (mu). + (X*coef) to expectation (mu). Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) Method for estimation of the dispersion parameter phi. Whether to use @@ -1084,7 +1101,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', - fit_intercept=True, family='normal', link='identity', + fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', selection='cyclic', random_state=None, copy_X=True, @@ -1159,27 +1176,48 @@ def fit(self, X, y, sample_weight=None): self._family_instance = GammaDistribution() elif self.family == 'inverse.gaussian': self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() else: raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" - " got (family={0})".format(self.family)) + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link if isinstance(self.link, Link): self._link_instance = self.link else: - if self.link == 'identity': + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. 
not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': self._link_instance = IdentityLink() elif self.link == 'log': self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() else: raise ValueError( - "The link must be an instance of class Link or" - " an element of ['identity', 'log']; got (link={0})" - .format(self.link)) + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: raise ValueError("Penalty term must be a non-negative number;" diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 361a237f2cc9f..de0857a34fe3a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -4,16 +4,18 @@ import scipy as sp from scipy import linalg, optimize, sparse +from sklearn.datasets import make_classification from sklearn.linear_model.glm import ( Link, IdentityLink, LogLink, + LogitLink, TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecant, + GeneralizedHyperbolicSecant, BinomialDistribution, GeneralizedLinearRegressor) -from sklearn.linear_model import ElasticNet, Ridge +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, @@ -26,9 +28,19 @@ def test_link_properties(link): rng = np.random.RandomState(0) x = rng.rand(100)*100 link = link() # instatiate object - assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) - assert_almost_equal(link.inverse_derivative(link.link(x)), - 1/link.derivative(x), decimal=10) + decimal = 10 + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + decimal = 8 + assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_almost_equal(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x), decimal=decimal) + # for LogitLink, in the following x should be between 0 and 1. 
+ # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) @pytest.mark.parametrize( @@ -214,6 +226,12 @@ def test_glm_P2_positive_semidefinite(): with pytest.raises(ValueError): glm.fit(X, y) + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + @pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) def test_glm_fit_intercept_argument(fit_intercept): @@ -331,7 +349,7 @@ def test_glm_identiy_regression(family, solver): coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor(alpha=0, family=family, + glm = GeneralizedLinearRegressor(alpha=0, family=family, link='identity', fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -350,7 +368,7 @@ def test_glm_log_regression(family, solver): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link=LogLink(), fit_intercept=False, + alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='least_squares') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -557,3 +575,28 @@ def obj(coef): glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. + """ + l1_ratio = 0.5 + n_samples = 500 + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=0) + log = LogisticRegression( + penalty='elasticnet', random_state=0, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) + assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) From 2993e03dbfc89b068373718c82f65957639767ac Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 7 Apr 2019 15:33:27 +0200 Subject: [PATCH 49/69] More efficient sparse matrices and refactor of irls and cd solver * refactor into function _irls_solver * refactor into function _cd_solver * replace of safe_sparse_dot by matmul operator @ * more efficient handling of fisher matrix * sparse coo matrices are converted to csc or csr * sample weights don't except sparse matrices * minor doc changes --- sklearn/linear_model/glm.py | 1101 ++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 85 +- 2 files changed, 717 insertions(+), 469 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 01e40b322946c..b2de866a4b69d 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -55,13 +55,12 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted, 
check_random_state def _check_weights(sample_weight, n_samples): - """Check that weights are non-negative and have the right shape.""" + """Check that sample weights are non-negative and have the right shape.""" if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): @@ -70,7 +69,7 @@ def _check_weights(sample_weight, n_samples): weights = sample_weight * np.ones(n_samples) else: _dtype = [np.float64, np.float32] - weights = check_array(sample_weight, accept_sparse='csr', + weights = check_array(sample_weight, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype=_dtype) if weights.ndim > 1: @@ -247,12 +246,11 @@ class ExponentialDispersionModel(metaclass=ABCMeta): deviance_derivative starting_mu + _mu_deviance_derivative _score _fisher_matrix _observed_information - _deviance - _deviance_derivative - _deviance_hessian + _eta_mu_score_fisher References ---------- @@ -280,7 +278,7 @@ def include_upper_bound(self): return self._include_upper_bound def in_y_range(self, x): - """Returns true if `x` is in the valid range of Y~EDM. + """Returns ``True`` if x is in the valid range of Y~EDM. Parameters ---------- @@ -411,7 +409,7 @@ def unit_deviance_derivative(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - return -2*(y-mu)/self.unit_variance(mu) + return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): r"""Compute the deviance. @@ -434,13 +432,7 @@ def deviance(self, y, mu, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return np.sum(weights*self.unit_deviance(y, mu)) - - def _deviance(self, coef, X, y, weights, link): - """Compute the deviance as a function of the coefficients and data.""" - lin_pred = safe_sparse_dot(X, coef, dense_output=True) - mu = link.inverse(lin_pred) - return self.deviance(y, mu, weights) + return np.sum(weights * self.unit_deviance(y, mu)) def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. @@ -458,7 +450,36 @@ def deviance_derivative(self, y, mu, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return weights*self.unit_deviance_derivative(y, mu) + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu, the deviance and it's derivative w.r.t coef.""" + lin_pred = X @ coef + mu = link.inverse(lin_pred) + dev = self.deviance(y, mu, weights) + d1 = link.inverse_derivative(lin_pred) + devp = X.T @ (d1 * self.deviance_derivative(y, mu, weights)) + return mu, dev, devp def _score(self, coef, phi, X, y, weights, link): r"""Compute the score function. 
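The chain rule behind ``_mu_deviance_derivative``, d(dev)/d(coef) = X.T @ (h'(eta) * d(dev)/d(mu)), can be verified against finite differences for the Poisson/log-link case; a standalone sketch with made-up data, not code from this patch::

    import numpy as np
    from scipy.special import xlogy
    from scipy.optimize import approx_fprime

    rng = np.random.RandomState(42)
    X = rng.rand(20, 3)
    y = rng.poisson(lam=2.0, size=20).astype(float)
    coef = 0.1 * rng.randn(3)

    def poisson_deviance(w):
        mu = np.exp(X @ w)                       # h(eta) = exp(eta) for the log link
        return 2.0 * np.sum(xlogy(y, y / mu) - y + mu)

    mu = np.exp(X @ coef)
    d1 = mu                                      # h'(eta) = exp(eta)
    ddev_dmu = -2.0 * (y - mu) / mu              # unit_deviance_derivative with v(mu) = mu
    analytic = X.T @ (d1 * ddev_dmu)             # simplifies to -2 * X.T @ (y - mu)
    numeric = approx_fprime(coef, poisson_deviance, 1e-7)
    assert np.allclose(analytic, numeric, rtol=1e-4, atol=1e-5)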
@@ -476,16 +497,14 @@ def _score(self, coef, phi, X, y, weights, link): with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. """ - n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) - d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=True) - score = safe_sparse_dot(X.T, temp, dense_output=True) + temp = sigma_inv * d * (y - mu) + score = X.T @ temp return score def _fisher_matrix(self, coef, phi, X, y, weights, link): @@ -508,14 +527,13 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): see func:`_score`. """ n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) - fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) + fisher_matrix = X.T @ d2_sigma_inv @ X return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): @@ -542,7 +560,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): see :func:`score_` function and :func:`_fisher_matrix`. """ n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) @@ -551,80 +569,59 @@ def _observed_information(self, coef, phi, X, y, weights, link): r = y - mu temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(temp, X, dense_output=False) - observed_information = safe_sparse_dot(X.T, temp, dense_output=False) + observed_information = X.T @ temp @ X return observed_information - def _deviance_derivative(self, coef, X, y, weights, link): - r"""Compute the derivative of the deviance w.r.t. coef. - - The derivative of the deviance w.r.t. `coef` (:math:`w`) as a - function of the coefficients `coef` and the data. - This is equivalent to :math:`-2\phi` times the score function - :func:`_score` (derivative of the log-likelihood). - """ - score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, - link=link) - return -2*score - - def _deviance_hessian(self, coef, X, y, weights, link): - r"""Compute the hessian matrix of the deviance w.r.t. coef. - - The hessian of the deviance w.r.t. `coef` (:math:`w`) is evaluated as - a function of the coefficients `coef` and the data. - It is equivalent to :math:`+2\phi` times the observed information - matrix. - """ - info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, - weights=weights, link=link) - return 2*info_matrix - - def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, + diag_fisher=False): """Compute linear predictor, mean, score function and fisher matrix. 
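The Fisher matrix used throughout is the sandwich product X.T @ diag(d) @ X with d = h'(eta)^2 / Var[y]. A standalone sketch with random data, showing that the product can be formed without materializing the diagonal matrix, by broadcasting in the dense case and via ``.multiply`` in the sparse case::

    import numpy as np
    from scipy import sparse

    rng = np.random.RandomState(0)
    X = rng.rand(50, 4)
    d = rng.rand(50) + 0.1                         # positive working weights

    fisher_dense = (X.T * d) @ X                   # dense: broadcast d over the rows of X
    Xs = sparse.csr_matrix(X)
    fisher_sparse = (Xs.transpose().multiply(d) @ Xs).toarray()
    assert np.allclose(fisher_dense, fisher_sparse)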
It calculates the linear predictor, the mean, score function (derivative of log-likelihood) and Fisher information matrix all in one go as function of `coef` (:math:`w`) and the data. + + Parameters + ---------- + diag_fisher : boolean, optional (default=False) + If ``True``, returns only an array d such that + fisher = X.T @ np.diag(d) @ X. + + Returns + ------- + (eta, mu, score, fisher) : tuple with 4 elements + The 4 elements are: + + * eta: ndarray, shape (X.shape[0],) + * mu: ndarray, shape (X.shape[0],) + * score: ndarray, shape (X.shape[0],) + * fisher: + + * If diag_fisher is ``False``, the full fisher matrix, + an array of shape (X.shape[1], X.shape[1]) + * If diag_fisher is ``True`, an array of shape (X.shape[0]) """ n_samples, n_features = X.shape # eta = linear predictor - eta = safe_sparse_dot(X, coef, dense_output=True) + eta = X @ coef mu = link.inverse(eta) sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) d1 = link.inverse_derivative(eta) # = h'(eta) # Alternatively: # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g # d1 = 1./link.derivative(mu) - d1_sigma_inv = sparse.dia_matrix((sigma_inv*d1, 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d1_sigma_inv, (y-mu), dense_output=True) - score = safe_sparse_dot(X.T, temp, dense_output=True) + score = X.T @ (sigma_inv * d1 * (y - mu)) # - d2_sigma_inv = sparse.dia_matrix((sigma_inv*(d1**2), 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) - fisher = safe_sparse_dot(X.T, temp, dense_output=False) - return eta, mu, score, fisher - - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. - ind_weight) * np.average(y, weights=weights)) + d2_sigma_inv = sigma_inv * (d1**2) + if diag_fisher: + return eta, mu, score, d2_sigma_inv + else: + if sparse.issparse(X): + d2_sigma_inv = sparse.dia_matrix((d2_sigma_inv, 0), + shape=(n_samples, n_samples)) + fisher = (X.T @ d2_sigma_inv @ X).toarray() + else: + fisher = (X.T * d2_sigma_inv) @ X + return eta, mu, score, fisher class TweedieDistribution(ExponentialDispersionModel): @@ -724,20 +721,20 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Predicted mean. 
""" - return self.power*np.power(mu, self.power-1) + return self.power * np.power(mu, self.power - 1) def unit_deviance(self, y, mu): p = self.power if p == 0: # NormalDistribution - return (y-mu)**2 + return (y - mu)**2 if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution - return 2 * (np.log(mu/y)+y/mu-1) + return 2 * (np.log(mu/y) + y/mu - 1) else: # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) @@ -784,11 +781,11 @@ def unit_variance(self, mu): return 1 + mu**2 def unit_variance_derivative(self, mu): - return 2*mu + return 2 * mu def unit_deviance(self, y, mu): - return (2*y*(np.arctan(y) - np.arctan(mu)) + - np.log((1+mu**2)/(1+y**2))) + return (2 * y * (np.arctan(y) - np.arctan(mu)) + + np.log((1 + mu**2)/(1 + y**2))) class BinomialDistribution(ExponentialDispersionModel): @@ -809,7 +806,7 @@ def unit_variance_derivative(self, mu): return 1 - 2 * mu def unit_deviance(self, y, mu): - return 2*(special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) def _irls_step(X, W, P2, z): @@ -824,20 +821,20 @@ def _irls_step(X, W, P2, z): Parameters ---------- - X : {numpy array, sparse matrix}, shape (n_samples, n_features) + X : {ndarray, sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array, shape (n_samples,) + W : ndarray, shape (n_samples,) - P2 : {numpy array, sparse matrix}, shape (n_features, n_features) + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) The L2-penalty matrix or vector (=diagonal matrix) - z : numpy array, shape (n_samples,) + z : ndarray, shape (n_samples,) Working observations Returns ------- - coef: array, shape (X.shape[1]) + coef: ndarray, shape (X.shape[1]) """ # Note: solve vs least squares, what is more appropriate? # scipy.linalg.solve seems faster, but scipy.linalg.lstsq @@ -868,27 +865,422 @@ def _irls_step(X, W, P2, z): return coef +def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + # eta = linear predictor + eta = X @ coef + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. 
+ # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z) + # updated linear predictor + # do it here for updated values for tolerance + eta = X @ coef + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + gradient = -(X.T @ (hp*(y-mu)/V)) + if P2.ndim == 1: + gradient += P2*coef + else: + gradient += P2 @ coef + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent = cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + B = fisher + if P2.ndim == 1: + coef_P2 = coef * P2 + if not diag_fisher: + B[np.diag_indices_from(B)] += P2 + else: + coef_P2 = P2 @ coef # P2 is symmetric, mat @ vec is usually faster + if not diag_fisher: + if sparse.issparse(P2): + B += P2.toarray() + else: + B += P2 + A = -score + coef_P2 # + d @ (H+P2) but d=0 so far + # inner loop + inner_iter = 0 + while inner_iter < max_inner_iter: + inner_iter += 1 + n_cycles += 1 + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + a = A[j] + if diag_fisher: + if sparse.issparse(X): + xj = X[:, j] + b = xj.transpose() @ xj.multiply(fisher[:, np.newaxis]) + b = b[0, 0] + else: + b = X[:, j] @ (fisher * X[:, j]) + + if P2.ndim == 1: + b += P2[j] + else: + b += P2[j, j] + else: + b = B[j, j] + + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[j] + d[j]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[j] + d[j]): + z = -(a - P1[j])/b + else: + z = -(coef[j] + d[j]) + + # update direction d + d[j] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + if sparse.issparse(X): + A += (X.transpose() @ + X[:, j].multiply(fisher[:, np.newaxis]) + ).toarray().ravel() * z + else: + # A += (X.T @ (fisher * X[:, j])) * z + # same without transpose of X + A += ((fisher * X[:, j]) @ X) * z + + if P2.ndim == 1: + A[j] += P2[j] * z + elif sparse.issparse(P2): + # slice columns as P2 is csc + A += P2[:, j].toarray().ravel() * z + else: + A += P2[:, j] * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, j] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[j, :] * z + # end of cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + mn_subgrad = np.where(coef + d == 0, + np.sign(A) * np.maximum(np.abs(A) - P1, 0), + A + np.sign(coef + d) * P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective beeing minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. + Note: f' = -score, H = Fisher matrix + + Parameters + ---------- + coef: ndarray, shape (n_features,) + + X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present). If not sparse, + pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. + + y : ndarray, shape (n_samples,) + Target values. + + weights: ndarray, shape (n_samples,) + Sample weights with which the deviance is weighted. The weights must + bee normalized and sum to 1. 
+ + P1 : {ndarray}, shape (n_features,) + The L1-penalty vector (=diagonal matrix) + + P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix). If a matrix is + passed, it must be symmetric. If X is sparse, P2 must also be sparse. + + family : ExponentialDispersionModel + + link : Link + + max_iter : int, optional (default=100) + Maximum numer of outer (Newton) iterations. + + max_inner_iter : int, optional (default=1000) + Maximum number of iterations, i.e. cycles over all features, in inner + loop. + + tol : float, optional (default=1e-4) + Covergence criterion is + sum_i(|minimum of norm of subgrad of objective_i|)<=tol. + + selection : str, optional (default='cyclic') + If 'random', randomly chose features in inner loop. + + random_state : {int, RandomState instance, None}, optional (default=None) + + diag_fisher : boolean, optional (default=False) + 'False' calculates full fisher matrix, 'True' only diagonal matrix s.t. + fisher = X.T @ diag @ X. This saves storage but needs more + matrix-vector multiplications. + + copy_X : boolean, optional (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + coef : ndarray, shape (n_features,) + + n_iter : numer of outer iterations = newton iterations + + n_cycles : number of cycles over features + + References + ---------- + Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + X = check_array(X, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if P2.ndim == 2: + P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if sparse.issparse(X): + if not sparse.isspmatrix_csc(X): + raise ValueError("If X is sparse, it must be in csc format" + "; got (format={})".format(X.format)) + if not sparse.isspmatrix_csc(P2): + raise ValueError("If X is sparse, P2 must also be sparse csc" + "format. Got P2 not sparse.") + random_state = check_random_state(random_state) + # Note: we already set P2 = l2*P2, P1 = l1*P1 + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + n_iter = 0 # number of outer iterations + n_cycles = 0 # number of (complete) cycles over features + converged = False + n_samples, n_features = X.shape + # line search parameters + (beta, sigma) = (0.5, 0.01) + # some precalculations + # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a + # 1d array representing a diagonal matrix. + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # fp_wP2 = f'(w) + w*P2 + if P2.ndim == 1: + fp_wP2 = -score + coef * P2 + else: + # Note: P2 is symmetric and matrix @ vector is faster for sparse + # matrices. 
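The inner (and outer) stopping criterion is the L1 norm of the minimum-norm subgradient of F(w) = f(w) + ||P1*w||_1: at w_j = 0 the subgradient is the interval [g_j - P1_j, g_j + P1_j] and its element closest to zero is taken. A small standalone sketch with hypothetical numbers::

    import numpy as np

    def min_norm_subgrad(w, g, P1):
        # g = gradient of the smooth part f(w) + 1/2 w*P2*w
        return np.where(w == 0,
                        np.sign(g) * np.maximum(np.abs(g) - P1, 0),
                        g + np.sign(w) * P1)

    w = np.array([0.0, 0.5, -1.0])
    g = np.array([0.3, -2.0, 1.0])
    P1 = np.array([1.0, 1.0, 1.0])
    print(min_norm_subgrad(w, g, P1))   # [ 0. -1.  0.]; only w[1] violates optimality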
+ fp_wP2 = -score + P2 @ coef + inner_tol = np.where(coef == 0, + np.sign(fp_wP2) * np.maximum(np.abs(fp_wP2) - P1, 0), + fp_wP2 + np.sign(coef) * P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef, ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d + + linalg.norm(P1 * (coef + d), ord=1) - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(X @ coef_wd) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd, ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd * P2) @ coef_wd) + else: + Fwd += 0.5 * (coef_wd @ (P2 @ coef_wd)) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum of norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + if P2.ndim == 1: + fp_wP2 = -score + coef * P2 + else: + fp_wP2 = -score + P2 @ coef # P2 is symmetric, mat @ vec is faster + mn_subgrad = np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2 + np.sign(coef) * P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the number of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean `mu=h(X*w)`. Therefore the fit minimizes - the following objective function with combined L1 and L2 priors as - regularizer:: + fitting and predicting the mean of the target y as mu=h(X*w). Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w - with inverse link function `h` and s=`sample_weight` (for - ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). - For ``P1=P2='identity'`` (``P1=None``, ``P2=None``), the penalty is the - elastic net:: + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). 
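Written out in code, the objective above for the normal family (deviance = weighted squared error, identity link) and ``P1=P2='identity'`` looks as follows; a standalone sketch, not the estimator's internal implementation::

    import numpy as np

    def objective(w, X, y, s, alpha, l1_ratio):
        mu = X @ w                                 # identity link
        dev = np.sum(s * (y - mu) ** 2)            # normal family deviance
        return (dev / (2 * np.sum(s))
                + alpha * l1_ratio * np.sum(np.abs(w))
                + 0.5 * alpha * (1 - l1_ratio) * (w @ w))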
+ For ``P1=P2='identity'``, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - If you are interested in controlling the L1 and L2 penalty + If you are interested in controlling the L1 and L2 penalties separately, keep in mind that this is equivalent to:: a * L1 + b * L2 @@ -897,9 +1289,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): alpha = a + b and l1_ratio = a / (a + b) - The parameter `l1_ratio` corresponds to alpha in the glmnet R package while - 'alpha' corresponds to the lambda parameter in glmnet. Specifically, - l1_ratio = 1 is the lasso penalty. + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. Read more in the :ref:`User Guide `. @@ -948,8 +1340,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, - optional (default='auto') + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). Option 'auto' sets the link depending on the chosen family as follows: @@ -982,7 +1374,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'cd' Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. + penalties. Note that in order to avoid unnecessary memory + duplication of the X argument in the ``fit`` method, X should be + directly passed as a Fortran-contiguous numpy array or sparse csc + matrix. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -990,8 +1385,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where ``g_i`` is the i-th component of the gradient (derivative of - the objective function). + where g_i is the i-th component of the gradient (derivative) of + the objective function. For the cd solver, covergence is reached + when ``sum_i(|minimum of norm of g_i|)``, where g_i is the + subgradient of the objective. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1007,7 +1404,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'irls' Start values of mu are calculated by family.starting_mu(..). Then, - one step of irls obtains start values for ``coef_`. This gives + one step of irls obtains start values for ``coef_``. This gives usually good results. 'least_squares' @@ -1042,6 +1439,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. + diag_fisher : boolean, (default=False) + Only relevant for solver 'cd'. If ``False``, the full Fisher matrix + (expected Hessian) is computed in each outer iteretion (Newton + iteration). If ``True``, only a diagonal matrix (stored as 1d array) is + computed, such that fisher = X.T @ diag @ X. This saves memory and + matrix-matrix multiplications, but needs more matrix-vector + multiplications. 
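The trade-off can be stated in one line: with ``diag_fisher=True`` only the diagonal d is kept and every product fisher @ s is evaluated as X.T @ (d * (X @ s)) instead of forming the (n_features, n_features) matrix. A standalone sketch with random data::

    import numpy as np

    rng = np.random.RandomState(1)
    X = rng.rand(1000, 20)
    d = rng.rand(1000) + 0.1
    s = rng.randn(20)

    full = (X.T * d) @ X                 # diag_fisher=False: O(n_features**2) memory
    implicit = X.T @ (d * (X @ s))       # diag_fisher=True: matrix-vector products only
    assert np.allclose(full @ s, implicit)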
If you use large sparse X or if you have many + features, i.e. n_features >> n_samples, you might set this option to + ``True``. + copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. @@ -1056,40 +1463,43 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- coef_ : array, shape (n_features,) - Estimated coefficients for the linear predictor (X*coef_) in the GLM. + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. dispersion_ : float - The dispersion parameter :math:`\\phi` if fit_dispersion is set. + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. n_iter_ : int - Actual number of iterations of the solver. + Actual number of iterations used in solver. Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - (penalized) maximum likelihood which is equivalent to minimizing the - deviance. + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. - For `alpha` > 0, the feature matrix `X` should be standardized in order to + For alpha > 0, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - If the target `y` is a ratio, appropriate sample weights `s` should be + If the target y is a ratio, appropriate sample weights s should be provided. - As an example, consider Poission distributed counts `z` (integers) and - weights `s=exposure` (time, money, persons years, ...). Then you fit - `y = z/s`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + As an example, consider Poission distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that `y` has a 'scaled' Poisson distributions. + in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. 
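A usage sketch of this exposure convention with made-up data, using the ``GeneralizedLinearRegressor`` added in this patch (import path as in the tests)::

    import numpy as np
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(100, 2)
    exposure = rng.uniform(0.5, 2.0, size=100)          # s, e.g. policy years
    lam = exposure * np.exp(0.5 + X @ np.array([1.0, -1.0]))
    z = rng.poisson(lam=lam)                            # observed counts
    y = z / exposure                                    # modeled ratio

    glm = GeneralizedLinearRegressor(family='poisson', link='log', alpha=0)
    glm.fit(X, y, sample_weight=exposure)               # weights = exposure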
References @@ -1104,8 +1514,8 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', - selection='cyclic', random_state=None, copy_X=True, - check_input=True, verbose=0): + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio self.P1 = P1 @@ -1121,6 +1531,7 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.start_params = start_params self.selection = selection self.random_state = random_state + self.diag_fisher = diag_fisher self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -1154,11 +1565,13 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1.1 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - dtype=_dtype, y_numeric=True, multi_output=False) + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) # Without converting y to float, deviance might raise # ValueError: Integers to negative integer powers are not allowed. - y = y.astype(np.float64) + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) weights = _check_weights(sample_weight, y.shape[0]) @@ -1263,7 +1676,7 @@ def fit(self, X, y, sample_weight=None): " correct length;" " got(start_params={0})".format(start_params)) else: - start_params = check_array(start_params, accept_sparse='csr', + start_params = check_array(start_params, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype=_dtype, copy=True) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or @@ -1274,12 +1687,14 @@ def fit(self, X, y, sample_weight=None): .format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) - if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " "'random'; got (selection={0})" .format(self.selection)) random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -1303,26 +1718,34 @@ def fit(self, X, y, sample_weight=None): "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." .format(P1.shape[0], X.shape[1])) + # If X is sparse, make P2 sparse, too. 
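As a usage sketch of the P1/P2 handling validated here: 1d arrays act as per-feature penalty weights, so e.g. the first feature can be left unpenalized (hypothetical data; the estimator is the one added in this patch)::

    import numpy as np
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(50, 4)
    y = X @ np.array([2.0, 1.0, 0.0, 0.0]) + 0.1

    P1 = np.array([0.0, 1.0, 1.0, 1.0])    # no L1 penalty on feature 0
    P2 = np.array([0.0, 1.0, 1.0, 1.0])    # no L2 penalty on feature 0
    glm = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0.5, P1=P1, P2=P2,
                                     solver='cd')
    glm.fit(X, y)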
if isinstance(self.P2, str) and self.P2 == 'identity': - if not sparse.issparse(X): - P2 = np.ones(X.shape[1]) - else: + if sparse.issparse(X): P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() + else: + P2 = np.ones(X.shape[1]) else: P2 = check_array(self.P2, copy=True, - accept_sparse=['csr', 'csc', 'coo'], + accept_sparse=['csr', 'csc'], dtype=_dtype, ensure_2d=False) if P2.ndim == 1: + P2 = np.asarray(P2) if P2.shape[0] != X.shape[1]: raise ValueError("P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " "got (P2.shape=({0},)), needed ({1},)" .format(P2.shape[0], X.shape[1])) - elif ((P2.ndim != 2) or - (P2.shape[0] != P2.shape[1]) or - (P2.shape[0] != X.shape[1])): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " @@ -1335,7 +1758,8 @@ def fit(self, X, y, sample_weight=None): if self.fit_intercept: # Note: intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) + Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X], + format=X.format) else: Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) P1 = np.concatenate((np.array([0]), P1)) @@ -1343,6 +1767,7 @@ def fit(self, X, y, sample_weight=None): P2 = np.concatenate((np.array([0]), P2)) elif sparse.issparse(P2): P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), + format=P2.format, dtype=P2.dtype).tocsr() else: # as of numpy 1.13 this would work: @@ -1355,7 +1780,7 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = Xnew.shape l1 = self.alpha * self.l1_ratio - l2 = self.alpha * (1-self.l1_ratio) + l2 = self.alpha * (1 - self.l1_ratio) P1 *= l1 P2 *= l2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') @@ -1484,315 +1909,114 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # algorithms for optimiation # TODO: Parallelize it? - self.n_iter_ = 0 - converged = False + # 4.1 IRLS ############################################################ - # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' - # Obj = objective function = 1/2 Dev + l2/2 w P2 w - # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 - # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) - # D2 = link.inverse_derivative(eta)^2 = D^2 - # W = D2/V(mu) - # l2 = alpha * (1 - l1_ratio) - # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w - # = -X' D (y-mu)/V(mu) + l2 P2 w - # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 - # Use Fisher matrix instead of full info matrix -X'(...) X, - # i.e. 
E[Dev''] with E[y-mu]=0: - # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, - # with z = eta + D^-1 (y-mu) # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') - # Note: ' denotes derivative, but also transpose for matrices if solver == 'irls': - # eta = linear predictor - eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = link.inverse(eta) - # D = h'(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - while self.n_iter_ < self.max_iter: - self.n_iter_ += 1 - # coef_old not used so far. - # coef_old = coef - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / V) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + P2, b = X' W z - coef = _irls_step(Xnew, W, P2, z) - # updated linear predictor - # do it here for updated values for tolerance - eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = link.inverse(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - # which tolerace? |coef - coef_old| or gradient? - # use gradient for compliance with newton-cg and lbfgs - # gradient = family._deviance_derivative( - # coef=coef, X=Xnew, y=y, weights=weights, link=link) - # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - gradient = -safe_sparse_dot(Xnew.T, hp*(y-mu)/V) + coef, self.n_iter_ = \ + _irls_solver(coef=coef, X=Xnew, y=y, weights=weights, P2=P2, + family=family, link=link, max_iter=self.max_iter, + tol=self.tol) + + # 4.2 L-BFGS ########################################################## + elif solver == 'lbfgs': + def func(coef, X, y, weights, P2, family, link): + mu, dev, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) if P2.ndim == 1: - gradient += P2*coef + L2 = P2 * coef else: - gradient += safe_sparse_dot(P2, coef) - if (np.max(np.abs(gradient)) <= self.tol): - converged = True - break - - if not converged: - warnings.warn("irls failed to converge. Increase the number " - "of iterations (currently {0})" - .format(self.max_iter), ConvergenceWarning) - - # 4.2 L-BFGS and Newton-CG ############################################ - # TODO: performance: make one function return both deviance and - # gradient of deviance - elif solver in ['lbfgs', 'newton-cg']: - def func(coef, *args): + L2 = P2 @ coef + obj = 0.5 * dev + 0.5 * (coef @ L2) + objp = 0.5 * devp + L2 + return obj, objp + + args = (Xnew, y, weights, P2, family, link) + coef, loss, info = fmin_l_bfgs_b( + func, coef, fprime=None, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + + # 4.3 Newton-CG ####################################################### + # We use again the fisher matrix instead of the hessian. More + # precisely, expected hessian of deviance. 
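The same pattern (objective and gradient returned together, ``fprime=None``) can be checked in isolation on an L2-penalized least-squares problem, where the minimizer has a closed form; a standalone sketch with made-up data::

    import numpy as np
    from scipy.optimize import fmin_l_bfgs_b

    rng = np.random.RandomState(0)
    X = rng.rand(50, 3)
    y = X @ np.array([1.0, 2.0, -1.0]) + 0.1 * rng.randn(50)
    l2 = 1.0

    def func(w):
        resid = y - X @ w
        obj = 0.5 * resid @ resid + 0.5 * l2 * (w @ w)
        grad = -X.T @ resid + l2 * w
        return obj, grad

    w_opt, loss, info = fmin_l_bfgs_b(func, np.zeros(3), factr=100, pgtol=1e-10)
    w_exact = np.linalg.solve(X.T @ X + l2 * np.eye(3), X.T @ y)
    assert np.allclose(w_opt, w_exact, atol=1e-4)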
+ elif solver == 'newton-cg': + def func(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = safe_sparse_dot(coef.T, P2*coef) + L2 = coef @ (P2 * coef) else: - L2 = safe_sparse_dot(coef.T, safe_sparse_dot(P2, coef)) - # A[np.diag_indices_from(A)] += P2 - return 0.5*family._deviance(coef, *args) + 0.5*L2 + L2 = coef @ (P2 @ coef) + mu = link.inverse(X @ coef) + return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 - def fprime(coef, *args): + def grad(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = P2*coef + L2 = P2 * coef else: - L2 = safe_sparse_dot(P2, coef) - return 0.5*family._deviance_derivative(coef, *args) + L2 + L2 = P2 @ coef + eta = X @ coef + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + grad = X.T @ (d1 * family.deviance_derivative(y, mu, weights)) + return 0.5 * grad + L2 - def grad_hess(coef, X, y, weights, link): + def grad_hess(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = P2*coef + L2 = P2 * coef else: - L2 = safe_sparse_dot(P2, coef) - grad = 0.5*family._deviance_derivative( - coef, X, y, weights, link) + L2 - hessian = 0.5*family._deviance_hessian( - coef, X, y, weights, link) - if P2.ndim == 1: - hessian[np.diag_indices_from(hessian)] += P2 - else: - hessian = hessian + P2 + L2 = P2 @ coef + eta = X @ coef + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + grad = 0.5 * \ + (X.T @ (d1 * family.deviance_derivative(y, mu, weights))) \ + + L2 + # expected hessian = X.T @ diag_matrix @ X + # calculate only diag_matrix + diag = d1**2 / family.variance(mu, phi=1, weights=weights) def Hs(s): - ret = safe_sparse_dot(hessian, s) + ret = 0.5 * (X.T @ (diag * (X @ s))) + if P2.ndim == 1: + ret += P2 * s + else: + ret += P2 @ s return ret + return grad, Hs - args = (Xnew, y, weights, link) - - if solver == 'lbfgs': - coef, loss, info = fmin_l_bfgs_b( - func, coef, fprime=fprime, args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) - self.n_iter_ = info['nit'] - elif solver == 'newton-cg': - coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) - - # 4.3 coordinate descent ############################################## - # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - # An Improved GLMNET for L1-regularized Logistic Regression, - # Journal of Machine Learning Research 13 (2012) 1999-2030 - # Note: Use Fisher matrix instead of Hessian for H - # - # 1. find optimal descent direction d by minimizing - # min_d F(w+d) = min_d F(w+d) - F(w) - # F = f + g, f(w) = 1/2 deviance, g(w) = 1/2 w*P2*w + ||P1*w||_1 - # 2. quadrdatic approximation of F(w+d)-F(w) = q(d): - # using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives - # q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d - # + ||P1*(w+d)||_1 - ||P1*w||_1 - # min_d q(d) - # 3. 
coordinate descent by updating coordinate j (d -> d+z*e_j): - # min_z q(d+z*e_j) - # = min_z q(d+z*e_j) - q(d) - # = min_z A_j z + 1/2 B_jj z^2 - # + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 - # A = f'(w) + d*H(w) + (w+d)*P2 - # B = H+P2 - # Note: we already set P2 = l2*P2, P1 = l1*P1, see above + args = (Xnew, y, weights, P2, family, link) + coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + # 4.4 coordinate descent ############################################## + # Note: we already set P1 = l1*P1, see above + # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') - # Note: f' = -score, H = Fisher matrix elif solver == 'cd': - # line search parameters - (beta, sigma) = (0.5, 0.01) - # max inner loops (cycles through all features) - max_inner_iter = 1000 - # some precalculations - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) - # set up space for search direction d for inner loop - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum-norm of subgradient of F - # fp_wP2 = f'(w) + w*P2 - if P2.ndim == 1: - fp_wP2 = -score + coef*P2 - else: - fp_wP2 = -score + safe_sparse_dot(coef, P2) - inner_tol = (np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2+np.sign(coef)*P1)) - inner_tol = linalg.norm(inner_tol, ord=1) - # outer loop - while self.n_iter_ < self.max_iter: - self.n_iter_ += 1 - # initialize search direction d (to be optimized) with zero - d.fill(0) - # inner loop - # TODO: use sparsity (coefficient already 0 due to L1 penalty) - # => active set of features for featurelist, see paper - # of Improved GLMNET or Gap Safe Screening Rules - # https://arxiv.org/abs/1611.05780 - # A = f'(w) + d*H(w) + (w+d)*P2 - # B = H+P2 - # Note: f'=-score and H=fisher are updated at the end of outer - # iteration - B = fisher - if P2.ndim == 1: - coef_P2 = coef * P2 - B[np.diag_indices_from(B)] += P2 - else: - coef_P2 = safe_sparse_dot(coef, P2) - B = B + P2 - A = -score + coef_P2 # + d*(H+P2) but d=0 so far - inner_iter = 0 - while inner_iter < max_inner_iter: - inner_iter += 1 - if self.selection == 'random': - featurelist = random_state.permutation(n_features) - else: - featurelist = np.arange(n_features) - for j in featurelist: - # minimize_z: a z + 1/2 b z^2 + c |d+z| - # a = A_j - # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, see 1.3 - # d = w_j + d_j - # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) - # with beta = z+d, beta_hat = d-a/b and gamma = c/b - # z = 1/b * S(bd-a,c) - d - # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - a = A[j] - b = B[j, j] - if P1[j] == 0: - if b == 0: - z = 0 - else: - z = -a/b - elif a + P1[j] < b * (coef[j]+d[j]): - if b == 0: - z = 0 - else: - z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[j]+d[j]): - if b == 0: - z = 0 - else: - z = -(a - P1[j])/b - else: - z = -(coef[j] + d[j]) - # update direction d - d[j] += z - # update A because d_j is now d_j+z - # A = f'(w) + d*H(w) + (w+d)*P2 - # => A += (H+P2)*e_j z = B_j * z - # Note: B is symmetric B = B.transpose - if sparse.issparse(B): - if sparse.isspmatrix_csc(B): - # slice columns - A += B[:, j].toarray().ravel() * z - else: - # slice rows - A += B[j, :].toarray().ravel() * z - else: - A += B[j, :] * z - # end of cycle - # stopping criterion for inner loop - # sum_i(|minimum-norm subgrad of q(d)_i|) - mn_subgrad = (np.where(coef + d == 0, - np.sign(A)*np.maximum(np.abs(A)-P1, 0), - A+np.sign(coef+d)*P1)) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= inner_tol: - if inner_iter == 1: - inner_tol = inner_tol/4. - break - # end of inner loop - # line search by sequence beta^k, k=0, 1, .. - # F(w + lambda d) - F(w) <= lambda * bound - # bound = sigma * (f'(w)*d + w*P2*d - # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1*coef, ord=1) - # Note: coef_P2 already calculated and still valid - bound = sigma * ( - safe_sparse_dot(-score, d) + - safe_sparse_dot(coef_P2, d) + - linalg.norm(P1*(coef+d), ord=1) - - P1w_1) - Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * safe_sparse_dot(coef_P2, coef) + - P1w_1) - la = 1./beta - for k in range(20): - la *= beta # starts with la=1 - mu_wd = link.inverse(safe_sparse_dot(Xnew, coef+la*d, - dense_output=True)) - Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1*(coef+la*d), ord=1)) - if P2.ndim == 1: - Fwd += 0.5 * safe_sparse_dot((coef+la*d)*P2, coef+la*d) - else: - Fwd += 0.5 * (safe_sparse_dot(coef+la*d, - safe_sparse_dot(P2, coef+la*d))) - if Fwd-Fw <= sigma*la*bound: - break - # update coefficients - # coef_old = coef.copy() - coef += la * d - # calculate eta, mu, score, Fisher matrix for next iteration - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) - # stopping criterion for outer loop - # sum_i(|minimum-norm subgrad of F(w)_i|) - # fp_wP2 = f'(w) + w*P2 - # Note: eta, mu and score are already updated - if P2.ndim == 1: - fp_wP2 = -score + coef*P2 - else: - fp_wP2 = -score + safe_sparse_dot(coef, P2) - mn_subgrad = (np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2+np.sign(coef)*P1)) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= self.tol: - converged = True - break - # end of outer loop - if not converged: - warnings.warn("Coordinate descent failed to converge. 
Increase" - " the number of iterations (currently {0})" - .format(self.max_iter), ConvergenceWarning) + # For coordinate descent, if X is sparse, it should be csc format + # If X is sparse, P2 must also be csc + if sparse.issparse(Xnew): + Xnew = Xnew.tocsc(copy=self.copy_X) + P2 = sparse.csc_matrix(P2) + + coef, self.n_iter_, self._n_cycles = \ + _cd_solver(coef=coef, X=Xnew, y=y, weights=weights, P1=P1, + P2=P2, family=family, link=link, + max_iter=self.max_iter, tol=self.tol, + selection=self.selection, random_state=random_state, + diag_fisher=self.diag_fisher, copy_X=self.copy_X) ####################################################################### # 5. postprocessing # @@ -1828,8 +2052,7 @@ def linear_predictor(self, X): X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) - return safe_sparse_dot(X, self.coef_, - dense_output=True) + self.intercept_ + return X @ self.coef_ + self.intercept_ def predict(self, X, sample_weight=None): """Predict uing GLM with feature matrix X. @@ -1884,7 +2107,7 @@ def estimate_phi(self, X, y, sample_weight=None): dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape weights = _check_weights(sample_weight, n_samples) - eta = safe_sparse_dot(X, self.coef_, dense_output=True) + eta = X @ self.coef_ if self.fit_intercept is True: eta += self.intercept_ n_features += 1 @@ -1907,19 +2130,19 @@ def estimate_phi(self, X, y, sample_weight=None): # "AssertionError: -0.28014056555724598 not greater than 0.5" # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): - r"""Compute D^2, the percentage of deviance explained. + """Compute D^2, the percentage of deviance explained. D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal for family='normal'. D^2 is defined as - :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` - is the null deviance, i.e. the deviance of a model with intercept - alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is averaged by sample_weight. - Best possible score is 1.0 and it can be negative (because the - model can be arbitrarily worse). + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). 
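For the normal family the deviance is the squared error, so D^2 reduces to the familiar R^2; a standalone sketch with made-up numbers::

    import numpy as np

    y_true = np.array([3.0, 1.0, 4.0, 1.0, 5.0])
    y_pred = np.array([2.5, 1.5, 3.5, 1.0, 4.5])
    weights = np.ones_like(y_true)

    dev = np.sum(weights * (y_true - y_pred) ** 2)         # D(y_true, y_pred)
    y_mean = np.average(y_true, weights=weights)
    dev_null = np.sum(weights * (y_true - y_mean) ** 2)    # D_null
    d2 = 1 - dev / dev_null                                # equals R^2 for this family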
Parameters ---------- diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de0857a34fe3a..ae8a5f4cfc5e7 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -138,16 +138,18 @@ def test_sample_weights_validation(): glm.fit(X, y, weights) -def test_glm_family_argument(): +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" - y = np.array([1, 2]) + y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - for (f, fam) in [('normal', NormalDistribution()), - ('poisson', PoissonDistribution()), - ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution())]: - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert_equal(type(glm._family_instance), type(fam)) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -155,14 +157,16 @@ def test_glm_family_argument(): glm.fit(X, y) -def test_glm_link_argument(): +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): """Test GLM link argument set as string.""" - y = np.array([1, 2]) + y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - for (l, link) in [('identity', IdentityLink()), - ('log', LogLink())]: - glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert_equal(type(glm._link_instance), type(link)) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError): @@ -317,6 +321,16 @@ def test_glm_random_state_argument(random_state): glm.fit(X, y) +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError): + glm.fit(X, y) + + @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -453,7 +467,12 @@ def test_normal_ridge(solver): assert_array_almost_equal(glm.predict(T), ridge.predict(T)) -def test_poisson_ridge(): +@pytest.mark.parametrize('solver, decimal, tol', + [('irls', 7, 1e-8), + ('lbfgs', 5, 1e-7), + ('newton-cg', 5, 1e-7), + ('cd', 7, 1e-8)]) +def test_poisson_ridge(solver, decimal, tol): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" @@ -470,22 +489,20 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} - s_tol = {'irls': 1e-8, 'lbfgs': 1e-7, 'newton-cg': 1e-7, 'cd': 1e-8} - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, - fit_intercept=True, family='poisson', - link='log', tol=s_tol[solver], - solver=solver, max_iter=300, - random_state=42) - glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=s_dec[solver]) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=s_dec[solver]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=tol, + solver=solver, max_iter=300, + random_state=42) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=decimal) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=decimal) -def test_normal_enet(): +@pytest.mark.parametrize('diag_fisher', [False, True]) +def test_normal_enet(diag_fisher): """Test elastic net regression with normal/gaussian family.""" rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 @@ -494,12 +511,14 @@ def test_normal_enet(): beta = rng.randn(n_features) y = 2 + np.dot(X, beta) + rng.randn(n_samples) + # 1. test normal enet on dense data glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, family='normal', link='identity', fit_intercept=True, tol=1e-8, max_iter=100, selection='cyclic', solver='cd', start_params='zero', - check_input=False) + check_input=False, + diag_fisher=diag_fisher) glm.fit(X, y) enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, @@ -509,6 +528,12 @@ def test_normal_enet(): assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + # 2. test normal enet on sparse data + X = sparse.csc_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + def test_poisson_enet(): """Test elastic net regression with poisson family and LogLink. From a6f9f13db9dd11d01d1dcab93819e67d5ca18b9d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 20 Apr 2019 15:59:39 +0200 Subject: [PATCH 50/69] Treat the intercept separately, i.e. X, P1, P2 never include intercept --- sklearn/linear_model/glm.py | 637 +++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 6 +- 2 files changed, 386 insertions(+), 257 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b2de866a4b69d..2afd1ddf8c79c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -7,8 +7,6 @@ # License: BSD 3 clause # TODO: Write more examples. -# TODO: Make option self.copy_X more meaningful. -# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -39,6 +37,7 @@ # sklearn.linear_models uses w for coefficients, standard literature on # GLMs use beta for coefficients and w for (sample) weights. # So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. 
coef[0] from __future__ import division @@ -86,6 +85,83 @@ def _check_weights(sample_weight, n_samples): return weights +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose().multiply(d) @ X).toarray() + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. + + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + class Link(metaclass=ABCMeta): """Abstract base class for Link funtions.""" @@ -473,13 +549,16 @@ def starting_mu(self, y, weights=1, ind_weight=0.5): (1. - ind_weight) * np.average(y, weights=weights)) def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu, the deviance and it's derivative w.r.t coef.""" - lin_pred = X @ coef + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) - dev = self.deviance(y, mu, weights) d1 = link.inverse_derivative(lin_pred) - devp = X.T @ (d1 * self.deviance_derivative(y, mu, weights)) - return mu, dev, devp + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp def _score(self, coef, phi, X, y, weights, link): r"""Compute the score function. @@ -499,12 +578,15 @@ def _score(self, coef, phi, X, y, weights, link): :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. Note: The derivative of the deviance w.r.t. coef equals -2 * score. 
""" - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) temp = sigma_inv * d * (y - mu) - score = X.T @ temp + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp return score def _fisher_matrix(self, coef, phi, X, y, weights, link): @@ -526,14 +608,14 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, see func:`_score`. """ - n_samples = X.shape[0] - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d2 = link.inverse_derivative(lin_pred)**2 - d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), - shape=(n_samples, n_samples)) - fisher_matrix = X.T @ d2_sigma_inv @ X + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): @@ -559,17 +641,17 @@ def _observed_information(self, coef, phi, X, y, weights, link): \right)`, see :func:`score_` function and :func:`_fisher_matrix`. """ - n_samples = X.shape[0] - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) d2 = link.inverse_derivative(lin_pred)**2 v = self.unit_variance_derivative(mu)/self.unit_variance(mu) r = y - mu - temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), - shape=(n_samples, n_samples)) - observed_information = X.T @ temp @ X + temp = sigma_inv * (-dp * r + d2 * v * r + d2) + intercept = (coef.size == X.shape[1] + 1) + observed_information = _safe_sandwich_dot(X, temp, + intercept=intercept) return observed_information def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, @@ -600,28 +682,29 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, an array of shape (X.shape[1], X.shape[1]) * If diag_fisher is ``True`, an array of shape (X.shape[0]) """ - n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) # eta = linear predictor - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) d1 = link.inverse_derivative(eta) # = h'(eta) # Alternatively: # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g # d1 = 1./link.derivative(mu) - score = X.T @ (sigma_inv * d1 * (y - mu)) - # - d2_sigma_inv = sigma_inv * (d1**2) + d1_sigma_inv = d1 * sigma_inv + temp = d1_sigma_inv * (y - mu) + if intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X + + d2_sigma_inv = d1 * d1_sigma_inv if diag_fisher: - return eta, mu, score, d2_sigma_inv + fisher_matrix = d2_sigma_inv else: - if sparse.issparse(X): - d2_sigma_inv = sparse.dia_matrix((d2_sigma_inv, 0), - shape=(n_samples, n_samples)) - fisher = (X.T @ d2_sigma_inv @ X).toarray() - else: - fisher = (X.T * d2_sigma_inv) @ X - return eta, mu, score, fisher + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return eta, mu, score, fisher_matrix class TweedieDistribution(ExponentialDispersionModel): @@ -809,7 +892,7 @@ def unit_deviance(self, y, mu): return 2 * (special.xlogy(y, y/mu) 
+ special.xlogy(1-y, (1-y)/(1-mu))) -def _irls_step(X, W, P2, z): +def _irls_step(X, W, P2, z, fit_intercept=True): """Compute one step in iteratively reweighted least squares. Solve A w = b for w with @@ -829,43 +912,57 @@ def _irls_step(X, W, P2, z): P2 : {ndarray, sparse matrix}, shape (n_features, n_features) The L2-penalty matrix or vector (=diagonal matrix) - z : ndarray, shape (n_samples,) + z : ndarray, shape (n_samples,) Working observations + fit_intercept : boolean, optional (default=True) + Returns ------- - coef: ndarray, shape (X.shape[1]) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. """ # Note: solve vs least squares, what is more appropriate? # scipy.linalg.solve seems faster, but scipy.linalg.lstsq # is more robust. - n_samples, n_features = X.shape - if sparse.issparse(X): - W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) if P2.ndim == 1: - L2 = (sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) - ).tocsr() + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() else: - L2 = sparse.csr_matrix(P2) - XtW = X.transpose() * W - A = XtW * X + L2 - b = XtW * z - # coef = splinalg.spsolve(A, b) - coef, *_ = splinalg.lsmr(A, b) + A[1:, 1:] += P2 else: - XtW = (X.T * W) - A = XtW.dot(X) + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + A = (XtW @ X).toarray() + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z if P2.ndim == 1: A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() else: A += P2 - b = XtW.dot(z) - # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) - coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) return coef -def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): """Solve GLM with L2 penalty by IRLS algorithm. Note: If X is sparse, P2 must also be sparse. @@ -889,7 +986,7 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): # Note: ' denotes derivative, but also transpose for matrices # eta = linear predictor - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) # D = h'(eta) hp = link.inverse_derivative(eta) @@ -906,10 +1003,10 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): z = eta + (y - mu) / hp # solve A*coef = b # A = X' W X + P2, b = X' W z - coef = _irls_step(X, W, P2, z) + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) # updated linear predictor # do it here for updated values for tolerance - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) hp = link.inverse_derivative(eta) V = family.variance(mu, phi=1, weights=weights) @@ -917,11 +1014,18 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): # which tolerace? |coef - coef_old| or gradient? 
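# Illustration (standalone sketch, not part of the patch): one penalized IRLS
# update for a Poisson GLM with log link and a scalar L2 penalty, mirroring
# the working weights W and working observations z used by _irls_step above.
# Toy data, dense arrays, no intercept.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = rng.poisson(lam=np.exp(X @ np.array([0.5, -0.2, 0.1])))
l2 = 0.1                                  # P2 = l2 * identity
coef = np.zeros(X.shape[1])

for _ in range(25):
    eta = X @ coef                        # linear predictor
    mu = np.exp(eta)                      # inverse of the log link
    hp = mu                               # h'(eta) = exp(eta)
    W = hp**2 / mu                        # working weights, variance(mu) = mu
    z = eta + (y - mu) / hp               # working observations
    A = (X.T * W) @ X + l2 * np.eye(X.shape[1])
    b = X.T @ (W * z)
    coef_new = np.linalg.solve(A, b)
    if np.max(np.abs(coef_new - coef)) < 1e-8:
        coef = coef_new
        break
    coef = coef_new
# at the fixed point, X.T @ (y - mu) - l2 * coef == 0 (penalized score eq.)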
# use gradient for compliance with newton-cg and lbfgs # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - gradient = -(X.T @ (hp*(y-mu)/V)) + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - gradient += P2*coef + gradient += P2 * coef[idx:] else: - gradient += P2 @ coef + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) if (np.max(np.abs(gradient)) <= tol): converged = True break @@ -937,7 +1041,7 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, max_inner_iter=1000, selection='cyclic', random_state=None, diag_fisher=False): - """Compute inner loop of coordinate descent = cycles through features. + """Compute inner loop of coordinate descent, i.e. cycles through features. Minimization of 1-d subproblems:: @@ -953,24 +1057,31 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, # of Improved GLMNET or Gap Safe Screening Rules # https://arxiv.org/abs/1611.05780 n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept B = fisher if P2.ndim == 1: - coef_P2 = coef * P2 + coef_P2 = coef[idx:] * P2 if not diag_fisher: - B[np.diag_indices_from(B)] += P2 + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 else: - coef_P2 = P2 @ coef # P2 is symmetric, mat @ vec is usually faster + coef_P2 = coef[idx:] @ P2 if not diag_fisher: if sparse.issparse(P2): - B += P2.toarray() + B[idx:, idx:] += P2.toarray() else: - B += P2 - A = -score + coef_P2 # + d @ (H+P2) but d=0 so far + B[idx:, idx:] += P2 + # A = -score + coef_P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 # inner loop - inner_iter = 0 - while inner_iter < max_inner_iter: + for inner_iter in range(1, max_inner_iter+1): inner_iter += 1 n_cycles += 1 + # cycle through features, update intercept separately at the end if selection == 'random': featurelist = random_state.permutation(n_features) else: @@ -985,70 +1096,85 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, # with beta = z+d, beta_hat = d-a/b and gamma = c/b # z = 1/b * S(bd-a,c) - d # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - a = A[j] + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() if sparse.issparse(X): - xj = X[:, j] - b = xj.transpose() @ xj.multiply(fisher[:, np.newaxis]) - b = b[0, 0] + Bj[idx:] = (X[:, j].transpose().multiply(fisher) @ X + ).toarray().ravel() else: - b = X[:, j] @ (fisher * X[:, j]) + Bj[idx:] = (fisher * X[:, j]) @ X if P2.ndim == 1: - b += P2[j] + Bj[idx:] += P2[j] else: - b += P2[j, j] + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] else: - b = B[j, j] + b = B[jdx, jdx] + # those ten lines aree what it is all about if b <= 0: z = 0 elif P1[j] == 0: z = -a/b - elif a + P1[j] < b * (coef[j] + d[j]): + elif a + P1[j] < b * (coef[jdx] + d[jdx]): z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[j] + d[j]): + elif a - P1[j] > b * 
(coef[jdx] + d[jdx]): z = -(a - P1[j])/b else: - z = -(coef[j] + d[j]) + z = -(coef[jdx] + d[jdx]) # update direction d - d[j] += z + d[jdx] += z # update A because d_j is now d_j+z # A = f'(w) + d*H(w) + (w+d)*P2 # => A += (H+P2)*e_j z = B_j * z # Note: B is symmetric B = B.transpose if diag_fisher: - if sparse.issparse(X): - A += (X.transpose() @ - X[:, j].multiply(fisher[:, np.newaxis]) - ).toarray().ravel() * z - else: - # A += (X.T @ (fisher * X[:, j])) * z - # same without transpose of X - A += ((fisher * X[:, j]) @ X) * z - - if P2.ndim == 1: - A[j] += P2[j] * z - elif sparse.issparse(P2): - # slice columns as P2 is csc - A += P2[:, j].toarray().ravel() * z - else: - A += P2[:, j] * z + # Bj = B[:, j] calculated above, still valid + A += Bj * z else: # B is symmetric, C- or F-contiguous, but never sparse if B.flags['F_CONTIGUOUS']: # slice columns like for sparse csc - A += B[:, j] * z + A += B[:, jdx] * z else: # B.flags['C_CONTIGUOUS'] might be true # slice rows - A += B[j, :] * z - # end of cycle + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle # stopping criterion for inner loop # sum_i(|minimum of norm of subgrad of q(d)_i|) - mn_subgrad = np.where(coef + d == 0, - np.sign(A) * np.maximum(np.abs(A) - P1, 0), - A + np.sign(coef + d) * P1) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= inner_tol: if inner_iter == 1: @@ -1058,7 +1184,7 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, return d, coef_P2, n_cycles, inner_tol -def _cd_solver(coef, X, y, weights, P1, P2, family, link, +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, max_iter=100, max_inner_iter=1000, tol=1e-4, selection='cyclic ', random_state=None, diag_fisher=False, copy_X=True): @@ -1083,7 +1209,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 A = f'(w) + d*H(w) + (w+d)*P2 - B = H+P2 + B = H + P2 Repeat steps 1-3 until convergence. Note: Use Fisher matrix instead of Hessian for H. @@ -1091,7 +1217,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Parameters ---------- - coef: ndarray, shape (n_features,) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present). If not sparse, @@ -1112,6 +1240,10 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, The L2-penalty matrix or vector (=diagonal matrix). If a matrix is passed, it must be symmetric. If X is sparse, P2 must also be sparse. + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + family : ExponentialDispersionModel link : Link @@ -1120,8 +1252,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Maximum numer of outer (Newton) iterations. max_inner_iter : int, optional (default=1000) - Maximum number of iterations, i.e. 
cycles over all features, in inner - loop. + Maximum number of iterations in each inner loop, i.e. max number of + cycles over all features per inner loop. tol : float, optional (default=1e-4) Covergence criterion is @@ -1133,8 +1265,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, random_state : {int, RandomState instance, None}, optional (default=None) diag_fisher : boolean, optional (default=False) - 'False' calculates full fisher matrix, 'True' only diagonal matrix s.t. - fisher = X.T @ diag @ X. This saves storage but needs more + ``False`` calculates full fisher matrix, ``True`` only diagonal matrix + s.t. fisher = X.T @ diag @ X. This saves storage but needs more matrix-vector multiplications. copy_X : boolean, optional (default=True) @@ -1142,7 +1274,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Returns ------- - coef : ndarray, shape (n_features,) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. n_iter : numer of outer iterations = newton iterations @@ -1174,6 +1308,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, n_cycles = 0 # number of (complete) cycles over features converged = False n_samples, n_features = X.shape + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept # line search parameters (beta, sigma) = (0.5, 0.01) # some precalculations @@ -1186,16 +1321,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, d = np.zeros_like(coef) # initial stopping tolerance of inner loop # use L1-norm of minimum of norm of subgradient of F - # fp_wP2 = f'(w) + w*P2 - if P2.ndim == 1: - fp_wP2 = -score + coef * P2 - else: - # Note: P2 is symmetric and matrix @ vector is faster for sparse - # matrices. - fp_wP2 = -score + P2 @ coef - inner_tol = np.where(coef == 0, - np.sign(fp_wP2) * np.maximum(np.abs(fp_wP2) - P1, 0), - fp_wP2 + np.sign(coef) * P1) + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) inner_tol = linalg.norm(inner_tol, ord=1) # outer loop while n_iter < max_iter: @@ -1211,23 +1337,23 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, # F(w + lambda d) - F(w) <= lambda * bound # bound = sigma * (f'(w)*d + w*P2*d # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1 * coef, ord=1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) # Note: coef_P2 already calculated and still valid - bound = sigma * (-(score @ d) + coef_P2 @ d + - linalg.norm(P1 * (coef + d), ord=1) - P1w_1) + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * (coef_P2 @ coef) + P1w_1) + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) la = 1./beta for k in range(20): la *= beta # starts with la=1 coef_wd = coef + la * d - mu_wd = link.inverse(X @ coef_wd) + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1 * coef_wd, ord=1)) + linalg.norm(P1 * coef_wd[idx:], ord=1)) if P2.ndim == 1: - Fwd += 0.5 * ((coef_wd * P2) @ coef_wd) + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) else: - Fwd += 0.5 * (coef_wd @ (P2 @ coef_wd)) + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) if Fwd - Fw <= sigma * la * bound: break # update coefficients @@ -1238,16 +1364,10 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher) # stopping criterion for outer loop - # sum_i(|minimum of norm of 
subgrad of F(w)_i|) + # sum_i(|minimum-norm of subgrad of F(w)_i|) # fp_wP2 = f'(w) + w*P2 # Note: eta, mu and score are already updated - if P2.ndim == 1: - fp_wP2 = -score + coef * P2 - else: - fp_wP2 = -score + P2 @ coef # P2 is symmetric, mat @ vec is faster - mn_subgrad = np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2 + np.sign(coef) * P1) + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= tol: converged = True @@ -1255,8 +1375,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, # end of outer loop if not converged: warnings.warn("Coordinate descent failed to converge. Increase" - " the number of iterations (currently {0})" - .format(max_iter), ConvergenceWarning) + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) return coef, n_iter, n_cycles @@ -1387,8 +1507,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where g_i is the i-th component of the gradient (derivative) of the objective function. For the cd solver, covergence is reached - when ``sum_i(|minimum of norm of g_i|)``, where g_i is the - subgradient of the objective. + when ``sum_i(|minimum-norm of g_i|)``, where g_i is the + subgradient of the objective and minimum-norm of g_i is the element of + the subgradient g_i with the smallest L2-norm. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1563,19 +1684,7 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1. input validation # ####################################################################### - # 1.1 validate arguments of fit ####################################### - _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], - dtype=_dtype, y_numeric=True, multi_output=False, - copy=self.copy_X) - # Without converting y to float, deviance might raise - # ValueError: Integers to negative integer powers are not allowed. - # Also, y must not be sparse. - y = np.asarray(y, dtype=np.float64) - - weights = _check_weights(sample_weight, y.shape[0]) - - # 1.2 validate arguments of __init__ ################################## + # 1.1 validate arguments of __init__ ################################## # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): @@ -1668,25 +1777,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - start_params = self.start_params - if isinstance(start_params, str): - if start_params not in ['irls', 'least_squares', 'zero']: - raise ValueError("The argument start_params must be 'irls', " - "'least-squares', 'zero' or an array of " - " correct length;" - " got(start_params={0})".format(start_params)) - else: - start_params = check_array(start_params, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype, copy=True) - if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or - (start_params.ndim != 1)): - raise ValueError("Start values for parameters must have the" - "right length and dimension; required (length" - "={0}, ndim=1); got (length={1}, ndim={2})." 
- .format(X.shape[1] + self.fit_intercept, - start_params.shape[0], - start_params.ndim)) if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " "'random'; got (selection={0})" @@ -1702,36 +1792,59 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params if isinstance(self.P1, str) and self.P1 == 'identity': - P1 = np.ones(X.shape[1]) + P1 = np.ones(n_features) else: P1 = np.atleast_1d(self.P1) try: - P1 = P1.astype(np.float64, casting='safe', copy=True) + P1 = P1.astype(np.float64, casting='safe', copy=False) except TypeError: raise TypeError("The given P1 cannot be converted to a numeric" "array; got (P1.dtype={0})." .format(P1.dtype)) - if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): + if (P1.ndim != 1) or (P1.shape[0] != n_features): raise ValueError("P1 must be either 'identity' or a 1d array " "with the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." - .format(P1.shape[0], X.shape[1])) + .format(P1.shape[0], n_features)) # If X is sparse, make P2 sparse, too. 
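# Illustration (hedged usage sketch, assuming this branch is installed and,
# as in the tests, exposes GeneralizedLinearRegressor from
# sklearn.linear_model.glm): because P1 and P2 exclude the intercept and are
# specified per feature, individual features can be left unpenalized by
# zeroing their entries.
import numpy as np
from sklearn.linear_model.glm import GeneralizedLinearRegressor

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
y = rng.poisson(lam=np.exp(1 + X @ np.array([0.3, -0.2, 0.5])))

P1 = np.array([1., 1., 0.])   # per-feature L1 weights; feature 2 unpenalized
P2 = np.array([1., 1., 0.])   # diagonal of the L2 penalty matrix
glm = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0.5, P1=P1, P2=P2,
                                 family='poisson', link='log', solver='cd')
glm.fit(X, y)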
if isinstance(self.P2, str) and self.P2 == 'identity': if sparse.issparse(X): - P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() else: - P2 = np.ones(X.shape[1]) + P2 = np.ones(n_features) else: P2 = check_array(self.P2, copy=True, - accept_sparse=['csr', 'csc'], + accept_sparse=_stype, dtype=_dtype, ensure_2d=False) if P2.ndim == 1: P2 = np.asarray(P2) - if P2.shape[0] != X.shape[1]: + if P2.shape[0] != n_features: raise ValueError("P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " @@ -1739,12 +1852,12 @@ def fit(self, X, y, sample_weight=None): .format(P2.shape[0], X.shape[1])) if sparse.issparse(X): P2 = (sparse.dia_matrix((P2, 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + shape=(n_features, n_features))).tocsc() elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and P2.shape[0] == X.shape[1]): if sparse.issparse(X): P2 = (sparse.dia_matrix((P2, 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + shape=(n_features, n_features))).tocsc() else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " @@ -1752,37 +1865,31 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - family = self._family_instance - link = self._link_instance - - if self.fit_intercept: - # Note: intercept is first column <=> coef[0] is for intecept - if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X], - format=X.format) - else: - Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) - P1 = np.concatenate((np.array([0]), P1)) - if P2.ndim == 1: - P2 = np.concatenate((np.array([0]), P2)) - elif sparse.issparse(P2): - P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), - format=P2.format, - dtype=P2.dtype).tocsr() - else: - # as of numpy 1.13 this would work: - # P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], - # [np.zeros((X.shape[1], 1)), P2]]) - P2 = np.hstack((np.zeros((X.shape[1], 1)), P2)) - P2 = np.vstack((np.zeros((1, X.shape[1]+1)), P2)) + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['irls', 'least_squares', 'zero']: + raise ValueError("The argument start_params must be 'irls', " + "'least-squares', 'zero' or an array of " + " correct length;" + " got(start_params={0})".format(start_params)) else: - Xnew = X + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) - n_samples, n_features = Xnew.shape l1 = self.alpha * self.l1_ratio l2 = self.alpha * (1 - self.l1_ratio) - P1 *= l1 - P2 *= l2 + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. 
it is symmetric if P2.ndim == 2: @@ -1791,14 +1898,12 @@ def fit(self, X, y, sample_weight=None): else: P2 = 0.5 * (P2 + P2.T) - # 1.3 additional validations ########################################## + # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - if not np.all(weights >= 0): - raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. if not isinstance(self.P1, str): # if self.P1 != 'identity': @@ -1830,7 +1935,7 @@ def fit(self, X, y, sample_weight=None): else: if not np.all(linalg.eigvalsh(P2) >= epsneg): raise ValueError("P2 must be positive semi-definite.") - # TODO: if alpha=0 check that Xnew is not rank deficient + # TODO: if alpha=0 check that X is not rank deficient # TODO: what else to check? ####################################################################### @@ -1874,13 +1979,14 @@ def fit(self, X, y, sample_weight=None): z = eta + (y-mu)/hp # solve A*coef = b # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(Xnew, W, P2, z) + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) elif start_params == 'least_squares': # less restrictive tolerance for finding start values tol = np.max([self.tol, np.sqrt(self.tol)]) if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ elif self.l1_ratio <= 0.01: # ElasticNet says l1_ratio <= 0.01 is not reliable @@ -1888,19 +1994,21 @@ def fit(self, X, y, sample_weight=None): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, alpha=self.alpha*n_samples, tol=tol) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? 
reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio, tol=tol) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ else: # start_params == 'zero' - coef = np.zeros(n_features) if self.fit_intercept: + coef = np.zeros(n_features+1) coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) else: # assign given array as start values coef = start_params @@ -1915,24 +2023,28 @@ def fit(self, X, y, sample_weight=None): # Note: we already symmetriezed P2 = 1/2 (P2 + P2') if solver == 'irls': coef, self.n_iter_ = \ - _irls_solver(coef=coef, X=Xnew, y=y, weights=weights, P2=P2, - family=family, link=link, max_iter=self.max_iter, - tol=self.tol) + _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, + fit_intercept=self.fit_intercept, family=family, + link=link, max_iter=self.max_iter, tol=self.tol) # 4.2 L-BFGS ########################################################## elif solver == 'lbfgs': def func(coef, X, y, weights, P2, family, link): - mu, dev, devp = \ + mu, devp = \ family._mu_deviance_derivative(coef, X, y, weights, link) + dev = family.deviance(y, mu, weights) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - obj = 0.5 * dev + 0.5 * (coef @ L2) - objp = 0.5 * devp + L2 + L2 = P2 @ coef[idx:] + obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) + objp = 0.5 * devp + objp[idx:] += L2 return obj, objp - args = (Xnew, y, weights, P2, family, link) + args = (X, y, weights, P2, family, link) coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1952,50 +2064,66 @@ def func(coef, X, y, weights, P2, family, link): # precisely, expected hessian of deviance. 
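# Illustration (standalone toy check, independent of the estimator code):
# for the normal family with identity link, unit weights and P2 = alpha * I,
# the L2-only objective handed to fmin_l_bfgs_b above is, scaling of alpha
# and weights aside, an ordinary ridge problem, so the optimizer recovers the
# closed-form solution.
import numpy as np
from scipy.optimize import fmin_l_bfgs_b

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = X @ np.array([1., -2., 0.5]) + 0.1 * rng.randn(50)
alpha = 0.7

def func(coef):
    resid = y - X @ coef                  # identity link: mu = X @ coef
    obj = 0.5 * resid @ resid + 0.5 * alpha * coef @ coef
    grad = -X.T @ resid + alpha * coef
    return obj, grad

coef, obj_min, info = fmin_l_bfgs_b(func, np.zeros(X.shape[1]))
closed_form = np.linalg.solve(X.T @ X + alpha * np.eye(X.shape[1]), X.T @ y)
assert np.allclose(coef, closed_form, atol=1e-4)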
elif solver == 'newton-cg': def func(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = coef @ (P2 * coef) + L2 = coef[idx:] @ (P2 * coef[idx:]) else: - L2 = coef @ (P2 @ coef) - mu = link.inverse(X @ coef) + L2 = coef[idx:] @ (P2 @ coef[idx:]) + mu = link.inverse(_safe_lin_pred(X, coef)) return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 def grad(coef, X, y, weights, P2, family, link): + mu, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - eta = X @ coef - mu = link.inverse(eta) - d1 = link.inverse_derivative(eta) - grad = X.T @ (d1 * family.deviance_derivative(y, mu, weights)) - return 0.5 * grad + L2 + L2 = P2 @ coef[idx:] + objp = 0.5 * devp + objp[idx:] += L2 + return objp def grad_hess(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - eta = X @ coef + L2 = P2 @ coef[idx:] + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) d1 = link.inverse_derivative(eta) - grad = 0.5 * \ - (X.T @ (d1 * family.deviance_derivative(y, mu, weights))) \ - + L2 - # expected hessian = X.T @ diag_matrix @ X + temp = d1 * family.deviance_derivative(y, mu, weights) + if intercept: + grad = np.concatenate(([0.5 * temp.sum()], + 0.5 * temp @ X + L2)) + else: + grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + + # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix diag = d1**2 / family.variance(mu, phi=1, weights=weights) - def Hs(s): - ret = 0.5 * (X.T @ (diag * (X @ s))) + def Hs(coef): + # return (0.5 * fisher + P2) @ coef + # ret = 0.5 * (X.T @ (diag * (X @ coef))) + ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) if P2.ndim == 1: - ret += P2 * s + ret += P2 * coef[idx:] else: - ret += P2 @ s + ret += P2 @ coef[idx:] + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) + ret = np.concatenate(([0.5 * (h0i @ coef)], + ret + 0.5 * coef[0] * h0i[1:])) return ret return grad, Hs - args = (Xnew, y, weights, P2, family, link) + args = (X, y, weights, P2, family, link) coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) @@ -2007,13 +2135,14 @@ def Hs(s): elif solver == 'cd': # For coordinate descent, if X is sparse, it should be csc format # If X is sparse, P2 must also be csc - if sparse.issparse(Xnew): - Xnew = Xnew.tocsc(copy=self.copy_X) + if sparse.issparse(X): + X = X.tocsc(copy=self.copy_X) P2 = sparse.csc_matrix(P2) coef, self.n_iter_, self._n_cycles = \ - _cd_solver(coef=coef, X=Xnew, y=y, weights=weights, P1=P1, - P2=P2, family=family, link=link, + _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, + P2=P2, fit_intercept=self.fit_intercept, + family=family, link=link, max_iter=self.max_iter, tol=self.tol, selection=self.selection, random_state=random_state, diag_fisher=self.diag_fisher, copy_X=self.copy_X) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index ae8a5f4cfc5e7..cdac151b77de6 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -566,7 +566,7 @@ def obj(coef): pd = PoissonDistribution() link = 
LogLink() N = y.shape[0] - mu = link.inverse(X @ coef[1:]+coef[0]) + mu = link.inverse(X @ coef[1:] + coef[0]) alpha, l1_ratio = (1, 0.5) return 1./(2.*N) * pd.deviance(y, mu) \ + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ @@ -587,9 +587,9 @@ def obj(coef): assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) - # start_params='least_squares' with different alpha + # check warm_start, therefore start with different alpha glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, - family='poisson', + family='poisson', max_iter=300, link='log', solver='cd', tol=1e-5, start_params='zero') glm.fit(X, y) From c9a7a95e89deaadc40b92edd8e1208d550998a72 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 21 Apr 2019 17:03:07 +0200 Subject: [PATCH 51/69] Revised option start_params * renamed option irls into guess * removed option least_squares * updated tests --- doc/modules/linear_model.rst | 12 +- sklearn/linear_model/glm.py | 223 ++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 140 ++++++++-------- 3 files changed, 207 insertions(+), 168 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index e60e9e84a4747..4bede17af581a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -925,12 +925,12 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, - check_input=True, copy_X=True, family='poisson', - fit_dispersion=None, fit_intercept=True, l1_ratio=0, - link='log', max_iter=100, random_state=None, - selection='cyclic', solver='auto', - start_params='irls', tol=0.0001, verbose=0, - warm_start=False) + check_input=True, copy_X=True, diag_fisher=False, + family='poisson', fit_dispersion=None, + fit_intercept=True, l1_ratio=0, link='log', + max_iter=100, random_state=None, selection='cyclic', + solver='auto', start_params='guess', tol=0.0001, + verbose=0, warm_start=False) >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE array([0.24630169, 0.43373464]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 2afd1ddf8c79c..a53cc39ecd307 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -48,9 +48,6 @@ import scipy.sparse.linalg as splinalg from scipy.optimize import fmin_l_bfgs_b import warnings -from .base import LinearRegression -from .coordinate_descent import ElasticNet -from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y @@ -93,6 +90,14 @@ def _safe_lin_pred(X, coef): return X @ coef +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + def _safe_sandwich_dot(X, d, intercept=False): """Compute sandwich product X.T @ diag(d) @ X. @@ -100,7 +105,9 @@ def _safe_sandwich_dot(X, d, intercept=False): first column of X. X can be sparse, d must be an ndarray. 
Always returns a ndarray.""" if sparse.issparse(X): - temp = (X.transpose().multiply(d) @ X).toarray() + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) else: temp = (X.T * d) @ X if intercept: @@ -945,7 +952,8 @@ def _irls_step(X, W, P2, z, fit_intercept=True): else: if sparse.issparse(X): XtW = X.transpose().multiply(W) - A = (XtW @ X).toarray() + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) else: XtW = (X.T * W) A = XtW @ X @@ -1105,8 +1113,9 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, if intercept: Bj[0] = fisher.sum() if sparse.issparse(X): - Bj[idx:] = (X[:, j].transpose().multiply(fisher) @ X - ).toarray().ravel() + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() else: Bj[idx:] = (fisher * X[:, j]) @ X @@ -1477,27 +1486,32 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ optional (default='auto') Algorithm to use in the optimization problem: 'auto' Sets 'irls' if l1_ratio equals 0, else 'cd'. + 'cd' + Coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. Note that in order to avoid unnecessary memory + duplication of X in the ``fit`` method, X should be directly passed + as a Fortran-contiguous numpy array or sparse csc matrix. + 'irls' - Iterated reweighted least squares (with Fisher scoring). + Iterated reweighted least squares. It is the standard algorithm for GLMs. It cannot deal with L1 penalties. + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + 'newton-cg', 'lbfgs' - Cannot deal with L1 penalties. + Newton conjugate gradient algorithm cannot deal with L1 penalties. - 'cd' - Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. Note that in order to avoid unnecessary memory - duplication of the X argument in the ``fit`` method, X should be - directly passed as a Fortran-contiguous numpy array or sparse csc - matrix. + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -1505,11 +1519,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where g_i is the i-th component of the gradient (derivative) of + where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. For the cd solver, covergence is reached - when ``sum_i(|minimum-norm of g_i|)``, where g_i is the - subgradient of the objective and minimum-norm of g_i is the element of - the subgradient g_i with the smallest L2-norm. + when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the + subgradient of the objective and minimum-norm of ``g_i`` is the element + of the subgradient ``g_i`` with the smallest L2-norm. 
warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1518,23 +1532,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to ``fit``), option ``start_params`` sets the start values for ``coef_`` and ``intercept_``. - start_params : {'irls', 'least_squares', 'zero', array of shape \ - (n_features*, )}, optional (default='irls') + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') Relevant only if ``warm_start=False`` or if fit is called the first time (``self.coef_`` does not yet exist). - 'irls' + 'guess' Start values of mu are calculated by family.starting_mu(..). Then, - one step of irls obtains start values for ``coef_``. This gives - usually good results. - - 'least_squares' - Start values for ``coef_`` are obtained by a least squares fit in the - link space (y is transformed to the space of the linear predictor). + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step, else the Newton step is + calculated by the cd solver. + This gives usually good starting values. 'zero' All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the average of y. + start value for the intercept is obtained by the weighted average of y. array The array of size n_features* is directly used as start values @@ -1560,17 +1572,17 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. - diag_fisher : boolean, (default=False) - Only relevant for solver 'cd'. If ``False``, the full Fisher matrix - (expected Hessian) is computed in each outer iteretion (Newton - iteration). If ``True``, only a diagonal matrix (stored as 1d array) is - computed, such that fisher = X.T @ diag @ X. This saves memory and - matrix-matrix multiplications, but needs more matrix-vector - multiplications. If you use large sparse X or if you have many - features, i.e. n_features >> n_samples, you might set this option to - ``True``. - - copy_X : boolean, optional, default True + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + + copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
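    As a usage sketch combining several of these options (assuming this
    branch is installed and exposes the estimator under
    ``sklearn.linear_model.glm``), a wide sparse design with ``solver='cd'``
    is the setting where ``diag_fisher=True`` is intended to pay off:

    import numpy as np
    from scipy import sparse
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    rng = np.random.RandomState(0)
    X = sparse.random(30, 200, density=0.05, format='csc', random_state=rng)
    y = rng.poisson(lam=np.exp(0.1 * np.asarray(X.sum(axis=1)).ravel()))

    glm = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0.9,
                                     family='poisson', link='log',
                                     solver='cd', diag_fisher=True,
                                     start_params='zero', selection='random',
                                     random_state=0)
    glm.fit(X, y)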
check_input : boolean, optional (default=True) @@ -1634,7 +1646,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params='irls', + tol=1e-4, warm_start=False, start_params='guess', selection='cyclic', random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha @@ -1867,11 +1879,10 @@ def fit(self, X, y, sample_weight=None): start_params = self.start_params if isinstance(start_params, str): - if start_params not in ['irls', 'least_squares', 'zero']: - raise ValueError("The argument start_params must be 'irls', " - "'least-squares', 'zero' or an array of " - " correct length;" - " got(start_params={0})".format(start_params)) + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) else: start_params = check_array(start_params, accept_sparse=False, force_all_finite=True, ensure_2d=False, @@ -1894,10 +1905,17 @@ def fit(self, X, y, sample_weight=None): # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric if P2.ndim == 2: if sparse.issparse(P2): - P2 = 0.5 * (P2 + P2.transpose()) + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() else: P2 = 0.5 * (P2 + P2.T) + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): @@ -1964,45 +1982,63 @@ def fit(self, X, y, sample_weight=None): else: coef = self.coef_ elif isinstance(start_params, str): - if start_params == 'irls': - # See 3.1 IRLS - # Use mu_start and apply one irls step to calculate coef + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls mu = family.starting_mu(y, weights=weights) - # linear predictor - eta = link.link(mu) - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / family.variance(mu, phi=1, weights=weights)) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(X, W, P2, z, - fit_intercept=self.fit_intercept) - elif start_params == 'least_squares': - # less restrictive tolerance for finding start values - tol = np.max([self.tol, np.sqrt(self.tol)]) - if self.alpha == 0: - reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(X, link.link(y)) - coef = reg.coef_ - elif self.l1_ratio <= 0.01: - # ElasticNet says l1_ratio <= 0.01 is not reliable - # => use Ridge - # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha*n_samples, tol=tol) - reg.fit(X, link.link(y)) - coef = reg.coef_ + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as 
X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=self.random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here else: - # TODO: Does this make sense at all? - reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio, - tol=tol) - reg.fit(X, link.link(y)) - coef = reg.coef_ + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) else: # start_params == 'zero' if self.fit_intercept: coef = np.zeros(n_features+1) @@ -2048,7 +2084,7 @@ def func(coef, X, y, weights, P2, family, link): coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) + maxiter=self.max_iter, factr=1e3) if self.verbose > 0: if info["warnflag"] == 1: warnings.warn("lbfgs failed to converge." 
@@ -2106,6 +2142,8 @@ def grad_hess(coef, X, y, weights, P2, family, link): # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix diag = d1**2 / family.variance(mu, phi=1, weights=weights) + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) def Hs(coef): # return (0.5 * fisher + P2) @ coef @@ -2116,7 +2154,6 @@ def Hs(coef): else: ret += P2 @ coef[idx:] if intercept: - h0i = np.concatenate(([diag.sum()], diag @ X)) ret = np.concatenate(([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:])) return ret @@ -2124,21 +2161,15 @@ def Hs(coef): return grad, Hs args = (X, y, weights, P2, family, link) - coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) + coef, self.n_iter_ = newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) # 4.4 coordinate descent ############################################## # Note: we already set P1 = l1*P1, see above # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') elif solver == 'cd': - # For coordinate descent, if X is sparse, it should be csc format - # If X is sparse, P2 must also be csc - if sparse.issparse(X): - X = X.tocsc(copy=self.copy_X) - P2 = sparse.csc_matrix(P2) - coef, self.n_iter_, self._n_cycles = \ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, P2=P2, fit_intercept=self.fit_intercept, diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index cdac151b77de6..17535c067bbb6 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -4,7 +4,7 @@ import scipy as sp from scipy import linalg, optimize, sparse -from sklearn.datasets import make_classification +from sklearn.datasets import make_classification, make_regression from sklearn.linear_model.glm import ( Link, IdentityLink, @@ -22,10 +22,12 @@ assert_array_equal, assert_array_almost_equal) +rng = np.random.RandomState(42) + + @pytest.mark.parametrize('link', Link.__subclasses__()) def test_link_properties(link): """Test link inverse and derivative.""" - rng = np.random.RandomState(0) x = rng.rand(100)*100 link = link() # instatiate object decimal = 10 @@ -86,7 +88,6 @@ def test_deviance_zero(family, chk_values): def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. 
Trick: Use numerical differentiation with y = mu""" - rng = np.random.RandomState(0) coef = np.array([-2, 1, 0, 1, 2.5]) phi = 0.5 X = rng.randn(10, 5) @@ -218,7 +219,6 @@ def test_glm_P2_argument(P2): def test_glm_P2_positive_semidefinite(): """Test GLM for a positive semi-definite P2 argument.""" n_samples, n_features = 10, 5 - rng = np.random.RandomState(42) y = np.arange(n_samples) X = np.zeros((n_samples, n_features)) P2 = np.diag([100, 10, 5, 0, -1E-5]) @@ -351,20 +351,15 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize( - 'family', - [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecant()]) @pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_identiy_regression(family, solver): +def test_glm_identiy_regression(solver): """Test GLM regression with identity link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor(alpha=0, family=family, link='identity', - fit_intercept=False, solver=solver) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False, solver=solver, + start_params='zero', tol=1e-7) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -375,34 +370,42 @@ def test_glm_identiy_regression(family, solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecant()]) -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_log_regression(family, solver): +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 6), + ('newton-cg', 1e-7, 6), + ('cd', 1e-7, 6)]) +def test_glm_log_regression(family, solver, tol, dec): """Test GLM regression with log link on a simple dataset.""" - coef = [1, 2] + coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, start_params='least_squares') + solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + assert_array_almost_equal(res.coef_, coef, decimal=dec) @pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_normal_ridge(solver): +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 5), + ('newton-cg', 1e-6, 5), + ('cd', 1e-6, 6)]) +def test_normal_ridge(solver, tol, dec): """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. """ - rng = np.random.RandomState(0) alpha = 1.0 # 1. 
With more samples than features - n_samples, n_features, n_predict = 10, 5, 10 - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - T = rng.randn(n_predict, n_features) + n_samples, n_features, n_predict = 100, 7, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, @@ -410,69 +413,74 @@ def test_normal_ridge(solver): ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, - tol=1e-6, max_iter=100, solver=solver, - random_state=42) + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=False, - tol=1e-6, max_iter=100, solver=solver, - random_state=42, fit_dispersion='chisqr') + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng, + fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) mu = glm.predict(X) assert_almost_equal(glm.dispersion_, np.sum((y-mu)**2/(n_samples-n_features))) # 2. 
With more features than samples and sparse - n_samples, n_features, n_predict = 5, 10, 10 - y = rng.randn(n_samples) - X = sparse.csr_matrix(rng.randn(n_samples, n_features)) - T = sparse.csr_matrix(rng.randn(n_predict, n_features)) + n_samples, n_features, n_predict = 10, 100, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, - family='normal', link='identity', - fit_intercept=True, solver=solver, - max_iter=300, random_state=42) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=300, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, - family='normal', link='identity', - fit_intercept=False, solver=solver) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol*2, max_iter=300, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) -@pytest.mark.parametrize('solver, decimal, tol', - [('irls', 7, 1e-8), - ('lbfgs', 5, 1e-7), - ('newton-cg', 5, 1e-7), - ('cd', 7, 1e-8)]) -def test_poisson_ridge(solver, decimal, tol): +@pytest.mark.parametrize('solver, tol, dec', + [('irls', 1e-7, 6), + ('lbfgs', 1e-7, 5), + ('newton-cg', 1e-7, 5), + ('cd', 1e-7, 7)]) +def test_poisson_ridge(solver, tol, dec): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" @@ -493,18 +501,17 @@ def test_poisson_ridge(solver, decimal, tol): fit_intercept=True, family='poisson', link='log', tol=tol, solver=solver, max_iter=300, - random_state=42) + random_state=rng) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=decimal) + decimal=dec) assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=decimal) + decimal=dec) @pytest.mark.parametrize('diag_fisher', [False, True]) def test_normal_enet(diag_fisher): """Test elastic net regression with normal/gaussian family.""" - rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 n_samples, n_features = 20, 2 X = rng.randn(n_samples, n_features).copy(order='F') @@ -556,7 +563,8 @@ def test_poisson_enet(): y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', link='log', solver='cd', tol=1e-8, - selection='random', random_state=42) + selection='random', random_state=rng, + start_params='guess') glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) @@ -591,7 +599,7 @@ def obj(coef): glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, family='poisson', max_iter=300, link='log', solver='cd', tol=1e-5, - start_params='zero') + selection='cyclic', start_params='zero') glm.fit(X, y) # warm start with original alpha and use of sparse matrices glm.warm_start = True @@ -612,9 +620,9 @@ def test_binomial_enet(alpha): n_samples = 500 X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, n_informative=5, n_redundant=0, n_repeated=0, - random_state=0) + random_state=rng) log = LogisticRegression( - penalty='elasticnet', random_state=0, fit_intercept=False, tol=1e-6, + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), solver='saga') log.fit(X, y) From a7755de2cdc5022b6d41285730ed12bd41628e66 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 17:34:11 +0200 Subject: [PATCH 52/69] Fix a few typos --- doc/modules/linear_model.rst | 2 +- .../plot_poisson_spline_regression.py | 2 +- sklearn/linear_model/glm.py | 53 ++++++++++--------- sklearn/linear_model/tests/test_glm.py | 4 +- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 4bede17af581a..8f17c67d950de 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -887,7 +887,7 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function beeing minimized +exponential dispersion model (EDM) [11]_. The objective function being minimized becomes .. 
math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py index fce85fae1ea8c..30b5881bba1f5 100644 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -5,7 +5,7 @@ As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` example, a Poisson regression with penalized B-splines (P-splines) [1]_ is -fitted on slightly different sinusodial, Poisson distributed data and +fitted on slightly different sinusoidal, Poisson distributed data and compared to an AdaBoost model with decision trees. One can see, that this is a hard problem for both estimators. diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index a53cc39ecd307..a35c68828de81 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -29,7 +29,7 @@ # L2: w*P2*w with P2 a (semi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and # Tikhonov regularization). -# - The link funtion (instance of class Link) is necessary for the evaluation +# - The link function (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. # Solution: link as argument in those functions @@ -170,7 +170,7 @@ def _min_norm_sugrad(coef, grad, P2, P1): class Link(metaclass=ABCMeta): - """Abstract base class for Link funtions.""" + """Abstract base class for Link functions.""" @abstractmethod def link(self, mu): @@ -201,7 +201,7 @@ def derivative(self, mu): def inverse(self, lin_pred): """Compute the inverse link function h(lin_pred). - Gives the inverse relationship between linkear predictor and the mean + Gives the inverse relationship between linker predictor and the mean mu=E[Y], i.e. h(linear predictor) = mu. Parameters @@ -357,7 +357,7 @@ def include_lower_bound(self): @property def include_upper_bound(self): - """Get True if upper bound for y is includede: y <= upper_bound.""" + """Get True if upper bound for y is included: y <= upper_bound.""" return self._include_upper_bound def in_y_range(self, x): @@ -859,7 +859,7 @@ def __init__(self): class GeneralizedHyperbolicSecant(ExponentialDispersionModel): """A class for the Generalized Hyperbolic Secant (GHS) distribution. - The GHS distribution is for tagets y in (-inf, inf). + The GHS distribution is for targets y in (-inf, inf). """ def __init__(self): self._lower_bound = -np.Inf @@ -881,7 +881,7 @@ def unit_deviance(self, y, mu): class BinomialDistribution(ExponentialDispersionModel): """A class for the Binomial distribution. - The Binomial distribution is for tagets y in [0, 1]. + The Binomial distribution is for targets y in [0, 1]. """ def __init__(self): self._lower_bound = 0 @@ -1131,7 +1131,7 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, else: b = B[jdx, jdx] - # those ten lines aree what it is all about + # those ten lines are what it is all about if b <= 0: z = 0 elif P1[j] == 0: @@ -1199,7 +1199,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, diag_fisher=False, copy_X=True): """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. 
- The objective beeing minimized in the coefficients w=coef is:: + The objective being minimized in the coefficients w=coef is:: F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 @@ -1207,7 +1207,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, 1. Find optimal descent direction d by minimizing min_d F(w+d) = min_d F(w+d) - F(w) - 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + 2. Quadratic approximation of F(w+d)-F(w) = q(d): using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + ||P1*(w+d)||_1 - ||P1*w||_1 @@ -1228,7 +1228,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, ---------- coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. + If fit_intercept=True, then c=X.shape[1] + 1. X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present). If not sparse, @@ -1265,7 +1265,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, cycles over all features per inner loop. tol : float, optional (default=1e-4) - Covergence criterion is + Convergence criterion is sum_i(|minimum of norm of subgrad of objective_i|)<=tol. selection : str, optional (default='cyclic') @@ -1285,9 +1285,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, ------- coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. + If fit_intercept=True, then c=X.shape[1] + 1. - n_iter : numer of outer iterations = newton iterations + n_iter : number of outer iterations = newton iterations n_cycles : number of cycles over features @@ -1312,7 +1312,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, "format. Got P2 not sparse.") random_state = check_random_state(random_state) # Note: we already set P2 = l2*P2, P1 = l1*P1 - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') n_iter = 0 # number of outer iterations n_cycles = 0 # number of (complete) cycles over features converged = False @@ -1427,7 +1427,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- alpha : float, optional (default=1) - Constant that multiplies the penalty terms und thus determines the + Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this @@ -1481,9 +1481,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'logit' for family 'binomial' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statisic or the deviance statistic. If None, the + the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ @@ -1520,7 +1520,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. 
For the cd solver, covergence is reached + the objective function. For the cd solver, convergence is reached when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the subgradient of the objective and minimum-norm of ``g_i`` is the element of the subgradient ``g_i`` with the smallest L2-norm. @@ -1626,7 +1626,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If the target y is a ratio, appropriate sample weights s should be provided. - As an example, consider Poission distributed counts z (integers) and + As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite @@ -1681,7 +1681,7 @@ def fit(self, X, y, sample_weight=None): Target values. sample_weight : {None, array-like}, shape (n_samples,),\ - optinal (default=None) + optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has Var[Y_i]=phi/w_i * v(mu). @@ -1995,7 +1995,7 @@ def fit(self, X, y, sample_weight=None): if self.fit_intercept: score = np.concatenate(([temp.sum()], temp @ X)) else: - score = temp @ X # sampe as X.T @ temp + score = temp @ X # same as X.T @ temp d2_sigma_inv = d1 * d1 * sigma_inv diag_fisher = self.diag_fisher @@ -2051,12 +2051,12 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 4. fit # ####################################################################### - # algorithms for optimiation + # algorithms for optimization # TODO: Parallelize it? # 4.1 IRLS ############################################################ # Note: we already set P2 = l2*P2, see above - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') if solver == 'irls': coef, self.n_iter_ = \ _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, @@ -2137,7 +2137,7 @@ def grad_hess(coef, X, y, weights, P2, family, link): grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2)) else: - grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix @@ -2168,7 +2168,7 @@ def Hs(coef): # 4.4 coordinate descent ############################################## # Note: we already set P1 = l1*P1, see above # Note: we already set P2 = l2*P2, see above - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') elif solver == 'cd': coef, self.n_iter_, self._n_cycles = \ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, @@ -2215,7 +2215,8 @@ def linear_predictor(self, X): return X @ self.coef_ + self.intercept_ def predict(self, X, sample_weight=None): - """Predict uing GLM with feature matrix X. + """Predict using GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. 
Parameters diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 17535c067bbb6..6172824cf1b79 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -29,7 +29,7 @@ def test_link_properties(link): """Test link inverse and derivative.""" x = rng.rand(100)*100 - link = link() # instatiate object + link = link() # instantiate object decimal = 10 if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 @@ -133,7 +133,7 @@ def test_sample_weights_validation(): with pytest.raises(ValueError): glm.fit(X, y, weights) - # 5. 1d but weith a negative value + # 5. 1d but with a negative value weights = [2, -1] with pytest.raises(ValueError): glm.fit(X, y, weights) From 9aa1fc41d726e5ac9007d9c867546aa70ea79ba7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 17:40:08 +0200 Subject: [PATCH 53/69] Make module private --- sklearn/linear_model/__init__.py | 4 ++-- sklearn/linear_model/{glm.py => _glm.py} | 0 sklearn/linear_model/tests/test_glm.py | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) rename sklearn/linear_model/{glm.py => _glm.py} (100%) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 0c5840f343a3a..0f7856fcc2046 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from .glm import (TweedieDistribution, - GeneralizedLinearRegressor) +from ._glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/_glm.py similarity index 100% rename from sklearn/linear_model/glm.py rename to sklearn/linear_model/_glm.py diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 6172824cf1b79..a148d11cb2632 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,3 +1,7 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + import numpy as np from numpy.testing import assert_allclose import pytest @@ -5,7 +9,8 @@ from scipy import linalg, optimize, sparse from sklearn.datasets import make_classification, make_regression -from sklearn.linear_model.glm import ( +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import ( Link, IdentityLink, LogLink, @@ -14,7 +19,7 @@ NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, GeneralizedHyperbolicSecant, BinomialDistribution, - GeneralizedLinearRegressor) +) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.utils.testing import ( From ca3eae24b4db9931eeac9fb925dba4f48199976c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 21:35:11 +0200 Subject: [PATCH 54/69] Working on tests --- sklearn/linear_model/tests/test_glm.py | 52 +++++++++++++------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a148d11cb2632..2c8a9c3d2c72c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -23,7 +23,7 @@ from sklearn.linear_model import ElasticNet, 
LogisticRegression, Ridge from sklearn.utils.testing import ( - assert_equal, assert_almost_equal, + assert_almost_equal, assert_array_equal, assert_array_almost_equal) @@ -35,16 +35,14 @@ def test_link_properties(link): """Test link inverse and derivative.""" x = rng.rand(100)*100 link = link() # instantiate object - decimal = 10 if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 - decimal = 8 - assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + assert_allclose(link.link(link.inverse(x)), x) # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) - assert_almost_equal(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x), decimal=decimal) + assert_allclose(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x)) # for LogitLink, in the following x should be between 0 and 1. # assert_almost_equal(link.inverse_derivative(link.link(x)), # 1./link.derivative(x), decimal=decimal) @@ -79,7 +77,7 @@ def test_family_bounds(family, expected): def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: - assert_almost_equal(family.deviance(x, x), 0, decimal=10) + assert_allclose(family.deviance(x, x), 0, atol=1e-9) @pytest.mark.parametrize( @@ -155,7 +153,7 @@ def test_glm_family_argument(f, fam): y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert_equal(type(glm._family_instance), type(fam)) + assert isinstance(glm._family_instance, fam.__class__) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -172,7 +170,7 @@ def test_glm_link_argument(l, link): y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert_equal(type(glm._link_instance), type(link)) + assert isinstance(glm._link_instance, link.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError): @@ -366,7 +364,7 @@ def test_glm_identiy_regression(solver): fit_intercept=False, solver=solver, start_params='zero', tol=1e-7) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + assert_allclose(res.coef_, coef) @pytest.mark.parametrize( @@ -375,11 +373,11 @@ def test_glm_identiy_regression(solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecant()]) -@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), - ('lbfgs', 1e-6, 6), - ('newton-cg', 1e-7, 6), - ('cd', 1e-7, 6)]) -def test_glm_log_regression(family, solver, tol, dec): +@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), + ('lbfgs', 1e-6), + ('newton-cg', 1e-7), + ('cd', 1e-7)]) +def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -388,7 +386,7 @@ def test_glm_log_regression(family, solver, tol, dec): alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef, decimal=dec) + assert_allclose(res.coef_, coef) @pytest.mark.filterwarnings('ignore::DeprecationWarning') @@ -421,10 +419,10 @@ def test_normal_ridge(solver, tol, dec): tol=tol, max_iter=100, solver=solver, check_input=False, random_state=rng) 
glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_) + assert glm.intercept_ == pytest.approx(ridge.intercept_) + assert_allclose(glm.predict(T), ridge.predict(T)) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) @@ -435,13 +433,13 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng, fit_dispersion='chisqr') glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert_allclose(glm.predict(T), ridge.predict(T)) mu = glm.predict(X) - assert_almost_equal(glm.dispersion_, - np.sum((y-mu)**2/(n_samples-n_features))) + assert_allclose(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) # 2. With more features than samples and sparse n_samples, n_features, n_predict = 10, 100, 10 @@ -461,7 +459,7 @@ def test_normal_ridge(solver, tol, dec): tol=tol, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) + assert glm.coef_.shape == (X.shape[1], ) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) @@ -474,7 +472,7 @@ def test_normal_ridge(solver, tol, dec): tol=tol*2, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) + assert glm.coef_.shape == (X.shape[1], ) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) From 61bc6b8e2b1e227f539656744e9b4a4fa9f514f2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 5 Jun 2019 18:45:20 +0200 Subject: [PATCH 55/69] Improve tests --- sklearn/linear_model/tests/test_glm.py | 82 +++++++++++++------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 2c8a9c3d2c72c..7d747c23ae441 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -22,9 +22,7 @@ ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge -from sklearn.utils.testing import ( - assert_almost_equal, - assert_array_equal, assert_array_almost_equal) +from sklearn.utils.testing import assert_array_equal rng = np.random.RandomState(42) @@ -355,16 +353,16 @@ def test_glm_check_input_argument(check_input): @pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_identiy_regression(solver): +def test_glm_identity_regression(solver): """Test GLM regression with identity link on a simple dataset.""" - coef = [1, 2] + coef = [1., 2.] 
X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False, solver=solver, start_params='zero', tol=1e-7) res = glm.fit(X, y) - assert_allclose(res.coef_, coef) + assert_allclose(res.coef_, coef, rtol=1e-6) @pytest.mark.parametrize( @@ -386,7 +384,7 @@ def test_glm_log_regression(family, solver, tol): alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_allclose(res.coef_, coef) + assert_allclose(res.coef_, coef, rtol=5e-6) @pytest.mark.filterwarnings('ignore::DeprecationWarning') @@ -420,9 +418,9 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_) - assert glm.intercept_ == pytest.approx(ridge.intercept_) - assert_allclose(glm.predict(T), ridge.predict(T)) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) @@ -434,9 +432,9 @@ def test_normal_ridge(solver, tol, dec): fit_dispersion='chisqr') glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_allclose(glm.predict(T), ridge.predict(T)) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-5) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) mu = glm.predict(X) assert_allclose(glm.dispersion_, np.sum((y-mu)**2/(n_samples-n_features))) @@ -452,7 +450,8 @@ def test_normal_ridge(solver, tol, dec): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, - solver='sag', normalize=False, max_iter=100000) + solver='sag', normalize=False, max_iter=100000, + random_state=42) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, @@ -460,22 +459,24 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, - solver='sag', normalize=False, max_iter=1000) + solver='sag', normalize=False, max_iter=1000, + random_state=42) ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=False, tol=tol*2, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-4) + assert_allclose(glm.intercept_, 
ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) @pytest.mark.parametrize('solver, tol, dec', @@ -506,10 +507,8 @@ def test_poisson_ridge(solver, tol, dec): solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=dec) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=dec) + assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) + assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-6) @pytest.mark.parametrize('diag_fisher', [False, True]) @@ -535,14 +534,14 @@ def test_normal_enet(diag_fisher): normalize=False, tol=1e-8, copy_X=True) enet.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) - assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) + assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) # 2. test normal enet on sparse data X = sparse.csc_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) - assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) + assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) def test_poisson_enet(): @@ -569,8 +568,8 @@ def test_poisson_enet(): selection='random', random_state=rng, start_params='guess') glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=2e-6) + assert_allclose(glm.coef_, glmnet_coef, rtol=2e-7) # test results with general optimization procedure def obj(coef): @@ -584,10 +583,10 @@ def obj(coef): + alpha * l1_ratio * np.sum(np.abs(coef[1:])) res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, options={'maxiter': 1000, 'disp': False}) - assert_almost_equal(glm.intercept_, res.x[0], decimal=5) - assert_almost_equal(glm.coef_, res.x[1:], decimal=5) - assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), - res.fun, decimal=8) + assert_allclose(glm.intercept_, res.x[0], rtol=1e-5) + assert_allclose(glm.coef_, res.x[1:], rtol=1e-5, atol=1e-9) + assert_allclose(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, rtol=1e-8) # same for start_params='zero' and selection='cyclic' # with reduced precision @@ -595,8 +594,8 @@ def obj(coef): link='log', solver='cd', tol=1e-5, selection='cyclic', start_params='zero') glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) # check warm_start, therefore start with different alpha glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, @@ -609,8 +608,8 @@ def obj(coef): glm.alpha = 1 X = sparse.csr_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) @pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) @@ -629,10 +628,11 @@ def test_binomial_enet(alpha): max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), solver='saga') log.fit(X, y) + glm = GeneralizedLinearRegressor( family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, 
alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', tol=1e-7) glm.fit(X, y) - assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) - assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) + assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) + assert_allclose(log.coef_[0, :], glm.coef_, rtol=2e-6) From b24a7cab9fc7d9f5dcec5b9d7657fee2d0a94283 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 5 Jun 2019 18:53:58 +0200 Subject: [PATCH 56/69] Remove unused dec parameter in tests --- sklearn/linear_model/tests/test_glm.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 7d747c23ae441..a3e943403a7a7 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -388,11 +388,11 @@ def test_glm_log_regression(family, solver, tol): @pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), - ('lbfgs', 1e-6, 5), - ('newton-cg', 1e-6, 5), - ('cd', 1e-6, 6)]) -def test_normal_ridge(solver, tol, dec): +@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), + ('lbfgs', 1e-6), + ('newton-cg', 1e-6), + ('cd', 1e-6)]) +def test_normal_ridge(solver, tol): """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. @@ -479,12 +479,12 @@ def test_normal_ridge(solver, tol, dec): assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) -@pytest.mark.parametrize('solver, tol, dec', - [('irls', 1e-7, 6), - ('lbfgs', 1e-7, 5), - ('newton-cg', 1e-7, 5), - ('cd', 1e-7, 7)]) -def test_poisson_ridge(solver, tol, dec): +@pytest.mark.parametrize('solver, tol', + [('irls', 1e-7), + ('lbfgs', 1e-7), + ('newton-cg', 1e-7), + ('cd', 1e-7)]) +def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" From f95b390c04fd4c0a333f08a649d9450ec44f0395 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 Jul 2017 21:50:10 +0200 Subject: [PATCH 57/69] ENH: add Generalized Linear Models, issue #5975 * new estimator GeneralizedLinearRegressor * loss functions for Tweedie family and Binomial * elasitc net penalties * control of penalties by matrix P2 and vector P1 * new solvers: coordinate descent, irls * tests * documentation * example for Poisson regression --- doc/modules/classes.rst | 1 + doc/modules/linear_model.rst | 129 + .../plot_poisson_spline_regression.py | 85 + sklearn/linear_model/__init__.py | 6 +- sklearn/linear_model/glm.py | 2331 +++++++++++++++++ sklearn/linear_model/tests/test_glm.py | 640 +++++ 6 files changed, 3191 insertions(+), 1 deletion(-) create mode 100644 examples/linear_model/plot_poisson_spline_regression.py create mode 100644 sklearn/linear_model/glm.py create mode 100644 sklearn/linear_model/tests/test_glm.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2dcf582a6ab39..4158e34e8bb8c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -727,6 +727,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 02f406f629e04..888566fab3601 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,6 +893,135 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized Linear Regression +============================= + +:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two +ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). + +Secondly, the squared loss function is replaced by the deviance :math:`D` of an +exponential dispersion model (EDM) [11]_. The objective function beeing minimized +becomes + +.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 + +\frac{\alpha(1-\rho)}{2} w^T P_2 w + +with sample weights :math:`s`. +:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in +the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows +for a more versatile L2 penalty. + +Use cases, where a loss different from the squared loss might be appropriate, +are the following: + + * If the target values :math:`y` are counts (non-negative integer valued) or + frequencies (non-negative), you might use a Poisson deviance with log-link. + + * If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. + + * If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). + +Since the linear predictor :math:`Xw` can be negative and +Poisson, Gamma and Inverse Gaussian distributions don't support negative values, +it is convenient to apply a link function different from the identity link +:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=\exp(Xw)`. 
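+As a minimal illustration of this point (plain NumPy only, independent of the
+estimator interface shown below), the inverse log-link maps any real-valued
+linear predictor to a strictly positive mean::
+
+    import numpy as np
+
+    eta = np.array([-3.0, -0.5, 0.0, 2.0])  # linear predictor X @ w, may be negative
+    mu = np.exp(eta)                         # inverse log-link h(eta) = exp(eta)
+    # mu is strictly positive: approx. [0.0498, 0.6065, 1.0, 7.389]
+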
+ +Note that the feature matrix `X` should be standardized before fitting. This +ensures that the penalty treats features equally. The estimator can be used as +follows: + + >>> from sklearn.linear_model import GeneralizedLinearRegressor + >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE + GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, + check_input=True, copy_X=True, diag_fisher=False, + family='poisson', fit_dispersion=None, + fit_intercept=True, l1_ratio=0, link='log', + max_iter=100, random_state=None, selection='cyclic', + solver='auto', start_params='guess', tol=0.0001, + verbose=0, warm_start=False) + >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + array([0.24630169, 0.43373464]) + >>> reg.intercept_ #doctest: +ELLIPSIS + -0.76383633... + + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + +Mathematical formulation +------------------------ + +In the unpenalized case, the assumptions are the following: + + * The target values :math:`y_i` are realizations of random variables + :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` + with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter + :math:`\phi` and sample weights :math:`s_i`. + * The aim is to predict the expectation :math:`\mu_i` with + :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + +Note that the first assumption implies +:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance +function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the +same as specifying a unit variance function (they are one-to-one). + +Including penalties helps to avoid overfitting or, in case of L1 penalty, to +obtain sparse solutions. But there are also other motivations to include them, +e.g. accounting for the dependence structure of :math:`y`. + +The objective function, which is independent of :math:`\phi`, is minimized with +respect to the coefficients :math:`w`. + +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as + +.. math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ + +Two remarks: + +* The deviances for at least Normal, Poisson and Gamma distributions are + strictly consistent scoring functions for the mean :math:`\mu`, see Eq. + (19)-(20) in [12]_. 
+ +* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) + you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. + + +.. topic:: References: + + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + See also `Exponential dispersion model. `_ + + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py new file mode 100644 index 0000000000000..fce85fae1ea8c --- /dev/null +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -0,0 +1,85 @@ +""" +================================= +Poisson Regression with B-Splines +================================= + +As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` +example, a Poisson regression with penalized B-splines (P-splines) [1]_ is +fitted on slightly different sinusodial, Poisson distributed data and +compared to an AdaBoost model with decision trees. +One can see, that this is a hard problem for both estimators. + +.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines + and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. + `doi:10.1214/ss/1038425655 + `_ + +""" +print(__doc__) + +# Author: Christian Lorentzen +# based on the AdaBoost regression example from Noel Dawe +# License: BSD 3 clause + +# importing necessary libraries +import numpy as np +from scipy.linalg import toeplitz +# from scipy.interpolate import BSpline +from scipy.interpolate import splev +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import AdaBoostRegressor +from sklearn.linear_model import GeneralizedLinearRegressor + + +# Create the dataset +xmin, xmax = 0, 6 +rng = np.random.RandomState(1) +X = np.linspace(xmin, xmax, 500)[:, np.newaxis] +y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) +y = rng.poisson(y_true, X.shape[0]) + +# b-spline basis +nknots, degree = 40, 3 +ns = nknots - degree - 1 # number of base spline functions +dx = (xmax - xmin) / (nknots - 1 - 2 * degree) +knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) +coef = np.zeros(ns) +splineBasis = np.empty((X.shape[0], ns), dtype=float) +for i in range(ns): + coef[i] = 1 +# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ +# .ravel() + splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() + coef[i] = 0 + +# second order difference matrix +P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) +P2[0, 0] = P2[-1, -1] = 1 + +# Fit regression model +regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), + n_estimators=10, random_state=rng) + +regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', + fit_intercept=True, alpha=0.02, + l1_ratio=0.1, P2=P2) + +regr_1.fit(X, y) +regr_2.fit(splineBasis, y) + +# Predict +y_1 = regr_1.predict(X) +y_2 = regr_2.predict(splineBasis) + +# Plot the results +plt.figure() +plt.plot(X, y_true, c="b", label="true mean") +plt.scatter(X, y, c="k", marker='.', label="training samples") +plt.plot(X, y_1, 
c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 770a1a49b600e..cbb2ad8826358 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,6 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from .glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -78,4 +80,6 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution'] diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py new file mode 100644 index 0000000000000..ac0007c1789a8 --- /dev/null +++ b/sklearn/linear_model/glm.py @@ -0,0 +1,2331 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. +# License: BSD 3 clause + +# TODO: Should the option `normalize` be included (like other linear models)? +# So far, it is not included. User must pass a normalized X. +# TODO: Add cross validation support, e.g. GCV? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? +# TODO: Negative values in P1 are not allowed so far. They could be used +# for group lasso. + +# Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# Estimators in sklearn are either regressors or classifiers. A GLM can do +# both depending on the distr (Normal => regressor, Binomial => classifier). +# Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a positive (semi-) definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). +# - The link funtion (instance of class Link) is necessary for the evaluation +# of deviance, score, Fisher and Hessian matrix as a functions of the +# coefficients, which is needed by optimizers. +# Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. 
coef[0] + + +from __future__ import division +from abc import ABCMeta, abstractmethod +import numbers +import numpy as np +from scipy import linalg, sparse, special +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +import warnings +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_X_y +from ..utils.optimize import newton_cg +from ..utils.validation import check_is_fitted, check_random_state + + +def _check_weights(sample_weight, n_samples): + """Check that sample weights are non-negative and have the right shape.""" + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight * np.ones(n_samples) + else: + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as " + "y") + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") + + return weights + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. 
+ + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link funtions.""" + + @abstractmethod + def link(self, mu): + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + raise NotImplementedError + + @abstractmethod + def derivative(self, mu): + """Compute the derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + raise NotImplementedError + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linkear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Compute 2nd derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + +class IdentityLink(Link): + """The identity link function g(x)=x.""" + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x).""" + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. 
- 2 * ep) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + include_lower_bound + include_upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _mu_deviance_derivative + _score + _fisher_matrix + _observed_information + _eta_mu_score_fisher + + References + ---------- + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + @property + def lower_bound(self): + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound + + @property + def upper_bound(self): + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound + + @property + def include_lower_bound(self): + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound + + @property + def include_upper_bound(self): + """Get True if upper bound for y is includede: y <= upper_bound.""" + return self._include_upper_bound + + def in_y_range(self, x): + """Returns ``True`` if x is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. + """ + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) + + @abstractmethod + def unit_variance(self, mu): + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance_derivative(self, mu): + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. + """ + raise NotImplementedError() + + def variance(self, mu, phi=1, weights=1): + r"""Compute the variance function. + + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. 
+ + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weights=1): + r"""Compute the derivative of the variance w.r.t. mu. + + Returns + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + raise NotImplementedError() + + def unit_deviance_derivative(self, y, mu): + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + return -2 * (y - mu) / self.unit_variance(mu) + + def deviance(self, y, mu, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, mu)) + + def deviance_derivative(self, y, mu, weights=1): + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. 
- ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp + + def _score(self, coef, phi, X, y, weights, link): + r"""Compute the score function. + + The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). + It is given by + + .. math: + + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + temp = sigma_inv * d * (y - mu) + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + return score + + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""Compute the Fisher information matrix. + + The Fisher information matrix, also known as expected information + matrix is given by + + .. math: + + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see func:`_score`. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weights, link): + r"""Compute the observed information matrix. + + The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood, is given by + + .. math: + + \mathbf{H}(\boldsymbol{w}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T} + = \mathbf{X}^T \left[ + - \mathbf{D}' \mathbf{R} + + \mathbf{D}^2 \mathbf{V} \mathbf{R} + + \mathbf{D}^2 + \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, + + with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, + :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ + v(\mu_i)} + \right)`, + see :func:`score_` function and :func:`_fisher_matrix`. 
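+
+        Note: For canonical link functions (e.g. the log link for the
+        Poisson distribution) the two terms involving :math:`\mathbf{R}`
+        cancel, so the observed information coincides with the Fisher
+        matrix of :func:`_fisher_matrix`.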
+        """
+        lin_pred = _safe_lin_pred(X, coef)
+        mu = link.inverse(lin_pred)
+        sigma_inv = 1/self.variance(mu, phi=phi, weights=weights)
+        dp = link.inverse_derivative2(lin_pred)
+        d2 = link.inverse_derivative(lin_pred)**2
+        v = self.unit_variance_derivative(mu)/self.unit_variance(mu)
+        r = y - mu
+        temp = sigma_inv * (-dp * r + d2 * v * r + d2)
+        intercept = (coef.size == X.shape[1] + 1)
+        observed_information = _safe_sandwich_dot(X, temp,
+                                                  intercept=intercept)
+        return observed_information
+
+    def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link,
+                             diag_fisher=False):
+        """Compute linear predictor, mean, score function and Fisher matrix.
+
+        It calculates the linear predictor, the mean, the score function
+        (derivative of the log-likelihood) and the Fisher information matrix
+        all in one go as a function of `coef` (:math:`w`) and the data.
+
+        Parameters
+        ----------
+        diag_fisher : boolean, optional (default=False)
+            If ``True``, returns only an array d such that
+            fisher = X.T @ np.diag(d) @ X.
+
+        Returns
+        -------
+        (eta, mu, score, fisher) : tuple with 4 elements
+            The 4 elements are:
+
+            * eta: ndarray, shape (X.shape[0],)
+            * mu: ndarray, shape (X.shape[0],)
+            * score: ndarray, shape (X.shape[1],) or (X.shape[1] + 1,) if an
+              intercept is included
+            * fisher:
+
+                * If diag_fisher is ``False``, the full Fisher matrix,
+                  an array of shape (X.shape[1], X.shape[1])
+                * If diag_fisher is ``True``, an array of shape (X.shape[0],)
+        """
+        intercept = (coef.size == X.shape[1] + 1)
+        # eta = linear predictor
+        eta = _safe_lin_pred(X, coef)
+        mu = link.inverse(eta)
+        sigma_inv = 1./self.variance(mu, phi=phi, weights=weights)
+        d1 = link.inverse_derivative(eta)  # = h'(eta)
+        # Alternatively:
+        # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is the inverse of g
+        # d1 = 1./link.derivative(mu)
+        d1_sigma_inv = d1 * sigma_inv
+        temp = d1_sigma_inv * (y - mu)
+        if intercept:
+            score = np.concatenate(([temp.sum()], temp @ X))
+        else:
+            score = temp @ X
+
+        d2_sigma_inv = d1 * d1_sigma_inv
+        if diag_fisher:
+            fisher_matrix = d2_sigma_inv
+        else:
+            fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv,
+                                               intercept=intercept)
+        return eta, mu, score, fisher_matrix
+
+
+class TweedieDistribution(ExponentialDispersionModel):
+    r"""A class for the Tweedie distribution.
+
+    A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely
+    defined by its mean-variance relationship
+    :math:`\mathrm{Var}[Y] \propto \mu^{power}`.
+
+    Special cases are:
+
+    ===== ================
+    Power Distribution
+    ===== ================
+    0     Normal
+    1     Poisson
+    (1,2) Compound Poisson
+    2     Gamma
+    3     Inverse Gaussian
+    ===== ================
+
+    Parameters
+    ----------
+    power : float (default=0)
+            The variance power of the `unit_variance`
+            :math:`v(\mu) = \mu^{power}`.
+            For ``0 < power < 1``, no distribution exists.
+    """
+    def __init__(self, power=0):
+        self.power = power
+        self._upper_bound = np.Inf
+        self._include_upper_bound = False
+        if power < 0:
+            # Extreme Stable
+            self._lower_bound = -np.Inf
+            self._include_lower_bound = False
+        elif power == 0:
+            # NormalDistribution
+            self._lower_bound = -np.Inf
+            self._include_lower_bound = False
+        elif (power > 0) and (power < 1):
+            raise ValueError('For 0 < power < 1, no distribution exists.')
+        elif power == 1:
+            # PoissonDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = True
+        elif (power > 1) and (power < 2):
+            # Compound Poisson
+            self._lower_bound = 0
+            self._include_lower_bound = True
+        elif power == 2:
+            # GammaDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif (power > 2) and (power < 3):
+            # Positive Stable
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif power == 3:
+            # InverseGaussianDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif power > 3:
+            # Positive Stable
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        else:
+            raise ValueError('The power must be a float, i.e.
real number, ' + 'got (power={})'.format(power)) + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + self._power = power + + def unit_variance(self, mu): + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return self.power * np.power(mu, self.power - 1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + # NormalDistribution + return (y - mu)**2 + if p == 1: + # PoissonDistribution + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (special.xlogy(y, y/mu) - y + mu) + elif p == 2: + # GammaDistribution + return 2 * (np.log(mu/y) + y/mu - 1) + else: + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + + +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. + + The GHS distribution is for tagets y in (-inf, inf). + """ + def __init__(self): + self._lower_bound = -np.Inf + self._upper_bound = np.Inf + self._include_lower_bound = False + self._include_upper_bound = False + + def unit_variance(self, mu): + return 1 + mu**2 + + def unit_variance_derivative(self, mu): + return 2 * mu + + def unit_deviance(self, y, mu): + return (2 * y * (np.arctan(y) - np.arctan(mu)) + + np.log((1 + mu**2)/(1 + y**2))) + + +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution is for tagets y in [0, 1]. + """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + +def _irls_step(X, W, P2, z, fit_intercept=True): + """Compute one step in iteratively reweighted least squares. + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. 
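+
+    For dense X, ``fit_intercept=False`` and a 2d ``P2``, this amounts
+    roughly to the following sketch::
+
+        A = (X.T * W) @ X + P2
+        b = X.T @ (W * z)
+        coef = scipy.linalg.lstsq(A, b)[0]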
+ + Parameters + ---------- + X : {ndarray, sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present) + + W : ndarray, shape (n_samples,) + + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) + + z : ndarray, shape (n_samples,) + Working observations + + fit_intercept : boolean, optional (default=True) + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + """ + # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) + if P2.ndim == 1: + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() + else: + A[1:, 1:] += P2 + else: + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() + else: + A += P2 + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + return coef + + +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + # eta = linear predictor + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. + # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) + # updated linear predictor + # do it here for updated values for tolerance + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? 
+ # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + gradient += P2 * coef[idx:] + else: + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent, i.e. cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + B = fisher + if P2.ndim == 1: + coef_P2 = coef[idx:] * P2 + if not diag_fisher: + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 + else: + coef_P2 = coef[idx:] @ P2 + if not diag_fisher: + if sparse.issparse(P2): + B[idx:, idx:] += P2.toarray() + else: + B[idx:, idx:] += P2 + # A = -score + coef_P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 + # inner loop + for inner_iter in range(1, max_inner_iter+1): + inner_iter += 1 + n_cycles += 1 + # cycle through features, update intercept separately at the end + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] + if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() + if sparse.issparse(X): + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() + else: + Bj[idx:] = (fisher * X[:, j]) @ X + + if P2.ndim == 1: + Bj[idx:] += P2[j] + else: + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] + else: + b = B[jdx, jdx] + + # those ten lines aree what it is all about + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[jdx] + d[jdx]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[jdx] + d[jdx]): + z = -(a - P1[j])/b + else: + z = -(coef[jdx] + d[jdx]) + + # update direction d + d[jdx] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + # Bj = B[:, j] calculated above, still valid + A += Bj * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, jdx] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective beeing minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H + P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. 
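+    Note: In step 3, the 1-d minimizer has the closed form (soft thresholding)
+          z = S(B_jj*(w_j+d_j) - A_j, P1_j)/B_jj - (w_j+d_j),
+          with S(u, c) = sign(u) * max(|u|-c, 0), see _cd_cycle.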
+ Note: f' = -score, H = Fisher matrix + + Parameters + ---------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + + X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present). If not sparse, + pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. + + y : ndarray, shape (n_samples,) + Target values. + + weights: ndarray, shape (n_samples,) + Sample weights with which the deviance is weighted. The weights must + bee normalized and sum to 1. + + P1 : {ndarray}, shape (n_features,) + The L1-penalty vector (=diagonal matrix) + + P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix). If a matrix is + passed, it must be symmetric. If X is sparse, P2 must also be sparse. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : ExponentialDispersionModel + + link : Link + + max_iter : int, optional (default=100) + Maximum numer of outer (Newton) iterations. + + max_inner_iter : int, optional (default=1000) + Maximum number of iterations in each inner loop, i.e. max number of + cycles over all features per inner loop. + + tol : float, optional (default=1e-4) + Covergence criterion is + sum_i(|minimum of norm of subgrad of objective_i|)<=tol. + + selection : str, optional (default='cyclic') + If 'random', randomly chose features in inner loop. + + random_state : {int, RandomState instance, None}, optional (default=None) + + diag_fisher : boolean, optional (default=False) + ``False`` calculates full fisher matrix, ``True`` only diagonal matrix + s.t. fisher = X.T @ diag @ X. This saves storage but needs more + matrix-vector multiplications. + + copy_X : boolean, optional (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + + n_iter : numer of outer iterations = newton iterations + + n_cycles : number of cycles over features + + References + ---------- + Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + X = check_array(X, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if P2.ndim == 2: + P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if sparse.issparse(X): + if not sparse.isspmatrix_csc(X): + raise ValueError("If X is sparse, it must be in csc format" + "; got (format={})".format(X.format)) + if not sparse.isspmatrix_csc(P2): + raise ValueError("If X is sparse, P2 must also be sparse csc" + "format. 
Got P2 not sparse.") + random_state = check_random_state(random_state) + # Note: we already set P2 = l2*P2, P1 = l1*P1 + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + n_iter = 0 # number of outer iterations + n_cycles = 0 # number of (complete) cycles over features + converged = False + n_samples, n_features = X.shape + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + # line search parameters + (beta, sigma) = (0.5, 0.01) + # some precalculations + # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a + # 1d array representing a diagonal matrix. + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd[idx:], ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) + else: + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum-norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): + """Regression via a Generalized Linear Model (GLM) with penalties. + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). 
Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + For ``P1=P2='identity'``, the penalty is the elastic net:: + + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 + + If you are interested in controlling the L1 and L2 penalties + separately, keep in mind that this is equivalent to:: + + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms und thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (default=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``'identity'`` is the same as a 1d array of ones. + Note that n_features = X.shape[1]. + + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. + + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') + The link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). 
Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statisic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ + optional (default='auto') + Algorithm to use in the optimization problem: + + 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. + + 'cd' + Coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. Note that in order to avoid unnecessary memory + duplication of X in the ``fit`` method, X should be directly passed + as a Fortran-contiguous numpy array or sparse csc matrix. + + 'irls' + Iterated reweighted least squares. + It is the standard algorithm for GLMs. It cannot deal with + L1 penalties. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + + 'newton-cg', 'lbfgs' + Newton conjugate gradient algorithm cannot deal with L1 penalties. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. For the cd solver, covergence is reached + when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the + subgradient of the objective and minimum-norm of ``g_i`` is the element + of the subgradient ``g_i`` with the smallest L2-norm. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step, else the Newton step is + calculated by the cd solver. + This gives usually good starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + selection : str, optional (default='cyclic') + For the solver 'cd' (coordinate descent), the coordinates (features) + can be updated in either cyclic or random order. 
+ If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially in the same order. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + random_state : {int, RandomState instance, None}, optional (default=None) + The seed of the pseudo random number generator that selects a random + feature to be updated for solver 'cd' (coordinate descent). + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + check_input : boolean, optional (default=True) + Allow to bypass several checks on input: y values in range of family, + sample_weight non-negative, P2 positive semi-definite. + Don't use this parameter unless you know what you do. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. + + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + + If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poission distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=s)``. The weights are necessary for the right (finite + sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. 
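+
+    A minimal usage sketch for this Poisson ratio example (assuming arrays
+    ``X``, counts ``z`` and exposure ``s`` are already defined)::
+
+        reg = GeneralizedLinearRegressor(family='poisson', link='log')
+        reg.fit(X, z/s, sample_weight=s)
+        rate = reg.predict(X)       # predicted ratio y = z/s
+        counts = s*rate             # expected counts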
+ + References + ---------- + For the coordinate descent implementation: + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', + fit_intercept=True, family='normal', link='auto', + fit_dispersion=None, solver='auto', max_iter=100, + tol=1e-4, warm_start=False, start_params='guess', + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.start_params = start_params + self.selection = selection + self.random_state = random_state + self.diag_fisher = diag_fisher + self.copy_X = copy_X + self.check_input = check_input + self.verbose = verbose + + def fit(self, X, y, sample_weight=None): + """Fit a Generalized Linear Model. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + y : array-like, shape (n_samples,) + Target values. + + sample_weight : {None, array-like}, shape (n_samples,),\ + optinal (default=None) + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. + + Returns + ------- + self : returns an instance of self. + """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of __init__ ################################## + # Guarantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: + if self.family == 'normal': + self._family_instance = NormalDistribution() + elif self.family == 'poisson': + self._family_instance = PoissonDistribution() + elif self.family == 'gamma': + self._family_instance = GammaDistribution() + elif self.family == 'inverse.gaussian': + self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() + else: + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) + + # Guarantee that self._link_instance is set to an instance of + # class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() + else: + raise ValueError( + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) + + # validate further arguments + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be a non-negative number;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool;" + " got {0}".format(self.fit_intercept)) + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) + if (not isinstance(self.max_iter, int) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random'; got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(self.check_input)) + + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params + if isinstance(self.P1, str) and self.P1 == 'identity': + P1 = np.ones(n_features) + else: + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=False) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) + if (P1.ndim != 1) or (P1.shape[0] != n_features): + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." + .format(P1.shape[0], n_features)) + # If X is sparse, make P2 sparse, too. 
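+        # (Illustrative example: with n_features=3, P1=[1., 0., 1.] excludes
+        #  the second coefficient from the L1 penalty; a 1d P2=[1., 0., 1.]
+        #  does the same for the L2 penalty, while P2='identity' penalizes
+        #  all coefficients equally.)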
+ if isinstance(self.P2, str) and self.P2 == 'identity': + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() + else: + P2 = np.ones(n_features) + else: + P2 = check_array(self.P2, copy=True, + accept_sparse=_stype, + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + P2 = np.asarray(P2) + if P2.shape[0] != n_features: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + else: + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0}, {1})), needed ({2}, {2})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) + + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) + else: + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) + + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1 - self.l1_ratio) + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if P2.ndim == 2: + if sparse.issparse(P2): + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() + else: + P2 = 0.5 * (P2 + P2.T) + + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + + # 1.4 additional validations ########################################## + if self.check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
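+            # (e.g. P1=[1., -1.] is rejected here, and a P2 with a negative
+            #  eigenvalue, such as [[1., 2.], [2., 1.]], is rejected by the
+            #  positive semi-definiteness check below)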
+ if not isinstance(self.P1, str): # if self.P1 != 'identity': + if not np.all(P1 >= 0): + raise ValueError("P1 must not have negative values.") + # check if P2 is positive semidefinite + # np.linalg.cholesky(P2) 'only' asserts positive definite + if not isinstance(self.P2, str): # self.P2 != 'identity' + # due to numerical precision, we allow eigenvalues to be a + # tiny bit negative + epsneg = -10 * np.finfo(P2.dtype).epsneg + if P2.ndim == 1 or P2.shape[0] == 1: + p2 = P2 + if sparse.issparse(P2): + p2 = P2.toarray() + if not np.all(p2 >= 0): + raise ValueError("1d array P2 must not have negative " + "values.") + elif sparse.issparse(P2): + # for sparse matrices, not all eigenvals can be computed + # efficiently, use only half of n_features + # k = how many eigenvals to compute + k = np.min([10, n_features // 10 + 1]) + sigma = 0 # start searching near this value + which = 'SA' # find smallest algebraic eigenvalues first + if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, + which=which) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + else: + if not np.all(linalg.eigvalsh(P2) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + # TODO: if alpha=0 check that X is not rank deficient + # TODO: what else to check? + + ####################################################################### + # 2. rescaling of weights (sample_weight) # + ####################################################################### + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights_sum = np.sum(weights) + weights = weights/weights_sum + + ####################################################################### + # 3. initialization of coef = (intercept_, coef_) # + ####################################################################### + # Note: Since phi=self.dispersion_ does not enter the estimation + # of mu_i=E[y_i], set it to 1. 
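+        # Overview of the start value strategies handled below (sketch):
+        #   warm_start=True      -> reuse (intercept_, coef_) of a prior fit
+        #   start_params='guess' -> mu = family.starting_mu(y, weights) and
+        #                           one IRLS/CD Newton step from there
+        #   start_params='zero'  -> coef = 0; with fit_intercept=True, the
+        #                           intercept starts at link(weighted mean of y)
+        #   start_params=array   -> use the given values directly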
+ + # set start values for coef + coef = None + if self.warm_start and hasattr(self, 'coef_'): + if self.fit_intercept: + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) + else: + coef = self.coef_ + elif isinstance(start_params, str): + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls + mu = family.starting_mu(y, weights=weights) + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=self.random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here + else: + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) + else: # start_params == 'zero' + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) + else: # assign given array as start values + coef = start_params + + ####################################################################### + # 4. fit # + ####################################################################### + # algorithms for optimiation + # TODO: Parallelize it? 
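+        # Solver dispatch (sketch): 'irls' and 'cd' call the dedicated
+        # solvers defined above; 'lbfgs' and 'newton-cg' minimize the smooth
+        # objective 0.5 * deviance(y, mu, weights) + 0.5 * w' P2 w via
+        # scipy.optimize.fmin_l_bfgs_b and sklearn's newton_cg, respectively.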
+
+ # 4.1 IRLS ############################################################
+ # Note: we already set P2 = l2*P2, see above
+ # Note: we already symmetrized P2 = 1/2 (P2 + P2')
+ if solver == 'irls':
+ coef, self.n_iter_ = \
+ _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2,
+ fit_intercept=self.fit_intercept, family=family,
+ link=link, max_iter=self.max_iter, tol=self.tol)
+
+ # 4.2 L-BFGS ##########################################################
+ elif solver == 'lbfgs':
+ def func(coef, X, y, weights, P2, family, link):
+ mu, devp = \
+ family._mu_deviance_derivative(coef, X, y, weights, link)
+ dev = family.deviance(y, mu, weights)
+ intercept = (coef.size == X.shape[1] + 1)
+ idx = 1 if intercept else 0 # offset if coef[0] is intercept
+ if P2.ndim == 1:
+ L2 = P2 * coef[idx:]
+ else:
+ L2 = P2 @ coef[idx:]
+ obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2)
+ objp = 0.5 * devp
+ objp[idx:] += L2
+ return obj, objp
+
+ args = (X, y, weights, P2, family, link)
+ coef, loss, info = fmin_l_bfgs_b(
+ func, coef, fprime=None, args=args,
+ iprint=(self.verbose > 0) - 1, pgtol=self.tol,
+ maxiter=self.max_iter, factr=1e3)
+ if self.verbose > 0:
+ if info["warnflag"] == 1:
+ warnings.warn("lbfgs failed to converge."
+ " Increase the number of iterations.",
+ ConvergenceWarning)
+ elif info["warnflag"] == 2:
+ warnings.warn("lbfgs failed for the reason: {0}"
+ .format(info["task"]))
+ self.n_iter_ = info['nit']
+
+ # 4.3 Newton-CG #######################################################
+ # We use again the fisher matrix instead of the hessian. More
+ # precisely, expected hessian of deviance.
+ elif solver == 'newton-cg':
+ def func(coef, X, y, weights, P2, family, link):
+ intercept = (coef.size == X.shape[1] + 1)
+ idx = 1 if intercept else 0 # offset if coef[0] is intercept
+ if P2.ndim == 1:
+ L2 = coef[idx:] @ (P2 * coef[idx:])
+ else:
+ L2 = coef[idx:] @ (P2 @ coef[idx:])
+ mu = link.inverse(_safe_lin_pred(X, coef))
+ return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2
+
+ def grad(coef, X, y, weights, P2, family, link):
+ mu, devp = \
+ family._mu_deviance_derivative(coef, X, y, weights, link)
+ intercept = (coef.size == X.shape[1] + 1)
+ idx = 1 if intercept else 0 # offset if coef[0] is intercept
+ if P2.ndim == 1:
+ L2 = P2 * coef[idx:]
+ else:
+ L2 = P2 @ coef[idx:]
+ objp = 0.5 * devp
+ objp[idx:] += L2
+ return objp
+
+ def grad_hess(coef, X, y, weights, P2, family, link):
+ intercept = (coef.size == X.shape[1] + 1)
+ idx = 1 if intercept else 0 # offset if coef[0] is intercept
+ if P2.ndim == 1:
+ L2 = P2 * coef[idx:]
+ else:
+ L2 = P2 @ coef[idx:]
+ eta = _safe_lin_pred(X, coef)
+ mu = link.inverse(eta)
+ d1 = link.inverse_derivative(eta)
+ temp = d1 * family.deviance_derivative(y, mu, weights)
+ if intercept:
+ grad = np.concatenate(([0.5 * temp.sum()],
+ 0.5 * temp @ X + L2))
+ else:
+ grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2
+
+ # expected hessian = fisher = X.T @ diag_matrix @ X
+ # calculate only diag_matrix
+ diag = d1**2 / family.variance(mu, phi=1, weights=weights)
+ if intercept:
+ h0i = np.concatenate(([diag.sum()], diag @ X))
+
+ def Hs(coef):
+ # return (0.5 * fisher + P2) @ coef
+ # ret = 0.5 * (X.T @ (diag * (X @ coef)))
+ ret = 0.5 * ((diag * (X @ coef[idx:])) @ X)
+ if P2.ndim == 1:
+ ret += P2 * coef[idx:]
+ else:
+ ret += P2 @ coef[idx:]
+ if intercept:
+ ret = np.concatenate(([0.5 * (h0i @ coef)],
+ ret + 0.5 * coef[0] * h0i[1:]))
+ return ret
+
+ return grad, Hs
+
+ args = (X, y, weights, P2, family, link)
+ coef, self.n_iter_ = \
+ newton_cg(grad_hess, func, grad, coef,
+ args=args, maxiter=self.max_iter,
+ tol=self.tol)
+
+ # 4.4 coordinate descent ##############################################
+ # Note: we already set P1 = l1*P1, see above
+ # Note: we already set P2 = l2*P2, see above
+ # Note: we already symmetrized P2 = 1/2 (P2 + P2')
+ elif solver == 'cd':
+ coef, self.n_iter_, self._n_cycles = \
+ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1,
+ P2=P2, fit_intercept=self.fit_intercept,
+ family=family, link=link,
+ max_iter=self.max_iter, tol=self.tol,
+ selection=self.selection, random_state=random_state,
+ diag_fisher=self.diag_fisher, copy_X=self.copy_X)
+
+ #######################################################################
+ # 5. postprocessing                                                   #
+ #######################################################################
+ if self.fit_intercept:
+ self.intercept_ = coef[0]
+ self.coef_ = coef[1:]
+ else:
+ # set intercept to zero as the other linear models do
+ self.intercept_ = 0.
+ self.coef_ = coef
+
+ if self.fit_dispersion in ['chisqr', 'deviance']:
+ # attention because of rescaling of weights
+ self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum
+
+ return self
+
+ def linear_predictor(self, X):
+ """Compute the linear_predictor = X*coef_ + intercept_.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ Samples.
+
+ Returns
+ -------
+ C : array, shape (n_samples,)
+ Returns predicted values of linear predictor.
+ """
+ check_is_fitted(self, "coef_")
+ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+ dtype='numeric', copy=True, ensure_2d=True,
+ allow_nd=False)
+ return X @ self.coef_ + self.intercept_
+
+ def predict(self, X, sample_weight=None):
+ """Predict using GLM with feature matrix X.
+ If sample_weight is given, returns prediction*sample_weight.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ Samples.
+
+ sample_weight : {None, array-like}, shape (n_samples,), optional \
+ (default=None)
+
+ Returns
+ -------
+ C : array, shape (n_samples,)
+ Returns predicted values times sample_weight.
+ """
+ # TODO: Is copy=True necessary?
+ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+ dtype='numeric', copy=True, ensure_2d=True,
+ allow_nd=False)
+ eta = self.linear_predictor(X)
+ mu = self._link_instance.inverse(eta)
+ weights = _check_weights(sample_weight, X.shape[0])
+
+ return mu*weights
+
+ def estimate_phi(self, X, y, sample_weight=None):
+ """Estimate/fit the dispersion parameter phi.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ Training data.
+
+ y : array-like, shape (n_samples,)
+ Target values.
+
+ sample_weight : {None, array-like}, shape (n_samples,), optional \
+ (default=None)
+ Sample weights.
+
+ Returns
+ -------
+ phi : float
+ Dispersion parameter.
+ """
+ check_is_fitted(self, "coef_")
+ _dtype = [np.float64, np.float32]
+ X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+ dtype=_dtype, y_numeric=True, multi_output=False)
+ n_samples, n_features = X.shape
+ weights = _check_weights(sample_weight, n_samples)
+ eta = X @ self.coef_
+ if self.fit_intercept is True:
+ eta += self.intercept_
+ n_features += 1
+ if n_samples <= n_features:
+ raise ValueError("Estimation of dispersion parameter phi requires"
+ " more samples than features, got"
+ " samples=X.shape[0]={0} and"
+ " n_features=X.shape[1]+fit_intercept={1}."
+ .format(n_samples, n_features)) + mu = self._link_instance.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weights*(y-mu)**2 / + self._family_instance.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self._family_instance.deviance(y, mu, weights) + return dev/(n_samples - n_features) + + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. + def score(self, X, y, sample_weight=None): + """Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. + + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. + + y : array-like, shape (n_samples,) + True values of target. + + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) w.r.t. y. + """ + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + weights = _check_weights(sample_weight, y.shape[0]) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. 
- dev / dev_null diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..7cb3f4a5b5969 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,640 @@ +import numpy as np +from numpy.testing import assert_allclose +import pytest +import scipy as sp +from scipy import linalg, optimize, sparse + +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model.glm import ( + Link, + IdentityLink, + LogLink, + LogitLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + GeneralizedHyperbolicSecant, BinomialDistribution, + GeneralizedLinearRegressor) +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge + +from sklearn.utils.testing import ( + assert_equal, assert_almost_equal, + assert_array_equal, assert_array_almost_equal) + + +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100)*100 + link = link() # instatiate object + decimal = 10 + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + decimal = 8 + assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_almost_equal(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x), decimal=decimal) + # for LogitLink, in the following x should be between 0 and 1. + # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_almost_equal(family.deviance(x, x), 0, decimal=10) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())]) +def test_fisher_matrix(family, link): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + rng = np.random.RandomState(42) + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) + + +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight.""" + # 1. scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 2. 2d array + weights = [[0]] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 3. 1d but wrong length + weights = [1, 0] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 4. 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 5. 1d but weith a negative value + weights = [2, -1] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): + """Test GLM family argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) + + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): + """Test GLM link argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) + + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) +def test_glm_P1_argument(P1): + """Test GLM for invalid P1 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = 
GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) + + +@pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], + sparse.csr_matrix([1, 2, 3]), [-1]]) +def test_glm_P2_argument(P2): + """Test GLM for invalid P2 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + rng = np.random.RandomState(42) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for invalid tol argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError): + 
glm.fit(X, y) + + +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(copy_X=copy_X) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) +def test_glm_check_input_argument(check_input): + """Test GLM for invalid check_input argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(check_input=check_input) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_identiy_regression(solver): + """Test GLM regression with identity link on a simple dataset.""" + coef = [1, 2] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.dot(X, coef) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False, solver=solver, + start_params='zero', tol=1e-7) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 6), + ('newton-cg', 1e-7, 6), + ('cd', 1e-7, 6)]) +def test_glm_log_regression(family, solver, tol, dec): + """Test GLM regression with log link on a simple dataset.""" + coef = [0.2, -0.1] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.exp(np.dot(X, coef)) + glm = GeneralizedLinearRegressor( + alpha=0, family=family, link='log', fit_intercept=False, + solver=solver, start_params='guess', tol=tol) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef, decimal=dec) + + +@pytest.mark.filterwarnings('ignore::DeprecationWarning') +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 5), + ('lbfgs', 1e-6, 5), + ('newton-cg', 1e-5, 5), + ('cd', 1e-6, 6)]) +def test_normal_ridge(solver, tol, dec): + """Test ridge regression for Normal distributions. + + Compare to test_ridge in test_ridge.py. + """ + rng = np.random.RandomState(42) + alpha = 1.0 + + # 1. 
With more samples than features + n_samples, n_features, n_predict = 100, 7, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, + solver='svd', normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='svd', normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng, + fit_dispersion='chisqr') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + mu = glm.predict(X) + assert_almost_equal(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) + + # 2. With more features than samples and sparse + n_samples, n_features, n_predict = 10, 100, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, + solver='sag', normalize=False, max_iter=100000) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=300, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, + solver='sag', normalize=False, max_iter=1000) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol*2, max_iter=300, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) + + +@pytest.mark.parametrize('solver, tol, dec', + [('irls', 1e-7, 6), + ('lbfgs', 1e-7, 5), + ('newton-cg', 1e-7, 5), + ('cd', 1e-7, 7)]) +def test_poisson_ridge(solver, tol, dec): + """Test ridge regression with poisson family and LogLink. 
+ + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=tol, + solver=solver, max_iter=300, + random_state=rng) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=dec) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=dec) + + +@pytest.mark.parametrize('diag_fisher', [False, True]) +def test_normal_enet(diag_fisher): + """Test elastic net regression with normal/gaussian family.""" + alpha, l1_ratio = 0.3, 0.7 + n_samples, n_features = 20, 2 + rng = np.random.RandomState(42) + X = rng.randn(n_samples, n_features).copy(order='F') + beta = rng.randn(n_features) + y = 2 + np.dot(X, beta) + rng.randn(n_samples) + + # 1. test normal enet on dense data + glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, + family='normal', link='identity', + fit_intercept=True, tol=1e-8, + max_iter=100, selection='cyclic', + solver='cd', start_params='zero', + check_input=False, + diag_fisher=diag_fisher) + glm.fit(X, y) + + enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, + normalize=False, tol=1e-8, copy_X=True) + enet.fit(X, y) + + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + + # 2. test normal enet on sparse data + X = sparse.csc_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + + +def test_poisson_enet(): + """Test elastic net regression with poisson family and LogLink. + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] 
+ X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-8, + selection='random', random_state=rng, + start_params='guess') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:] + coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_almost_equal(glm.intercept_, res.x[0], decimal=5) + assert_almost_equal(glm.coef_, res.x[1:], decimal=5) + assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, decimal=8) + + # same for start_params='zero' and selection='cyclic' + # with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + # check warm_start, therefore start with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', max_iter=300, + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. 
+ """ + l1_ratio = 0.5 + n_samples = 500 + rng = np.random.RandomState(42) + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=rng) + log = LogisticRegression( + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) + assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) From 9b574bdc0b3a4a1081f73563d0468c864fcbbb22 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 11 Jun 2019 07:30:52 -0500 Subject: [PATCH 58/69] Fix docstrings for the new print_changed_only=True by default --- doc/modules/linear_model.rst | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 52f28346cc047..b9f0e96734ae5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -942,17 +942,11 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE - GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, - check_input=True, copy_X=True, diag_fisher=False, - family='poisson', fit_dispersion=None, - fit_intercept=True, l1_ratio=0, link='log', - max_iter=100, random_state=None, selection='cyclic', - solver='auto', start_params='guess', tol=0.0001, - verbose=0, warm_start=False) - >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.coef_ array([0.24630169, 0.43373464]) - >>> reg.intercept_ #doctest: +ELLIPSIS + >>> reg.intercept_ -0.76383633... From 90299fdd8a4745965eb8f9dbe26ddb685f1531f1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 12 Jun 2019 04:31:50 -0500 Subject: [PATCH 59/69] Increase coverage --- sklearn/linear_model/_glm.py | 22 +++----- sklearn/linear_model/tests/test_glm.py | 78 ++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 8152e84ac7253..055aacf26d747 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -182,7 +182,7 @@ def link(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - raise NotImplementedError + pass @abstractmethod def derivative(self, mu): @@ -193,7 +193,7 @@ def derivative(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - raise NotImplementedError + pass @abstractmethod def inverse(self, lin_pred): @@ -207,7 +207,7 @@ def inverse(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - raise NotImplementedError + pass @abstractmethod def inverse_derivative(self, lin_pred): @@ -218,7 +218,7 @@ def inverse_derivative(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. 
""" - raise NotImplementedError + pass @abstractmethod def inverse_derivative2(self, lin_pred): @@ -229,7 +229,7 @@ def inverse_derivative2(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - raise NotImplementedError + pass class IdentityLink(Link): @@ -400,7 +400,7 @@ def unit_variance(self, mu): mu : array, shape (n_samples,) Predicted mean. """ - raise NotImplementedError() + pass @abstractmethod def unit_variance_derivative(self, mu): @@ -413,7 +413,7 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Target values. """ - raise NotImplementedError() + pass def variance(self, mu, phi=1, weights=1): r"""Compute the variance function. @@ -473,7 +473,7 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - raise NotImplementedError() + pass def unit_deviance_derivative(self, y, mu): r"""Compute the derivative of the unit deviance w.r.t. mu. @@ -1079,7 +1079,6 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, B[idx:, idx:] += P2.toarray() else: B[idx:, idx:] += P2 - # A = -score + coef_P2 A = -score A[idx:] += coef_P2 # A += d @ (H+P2) but so far d=0 @@ -1302,9 +1301,6 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], order='F', copy=copy_X) if sparse.issparse(X): - if not sparse.isspmatrix_csc(X): - raise ValueError("If X is sparse, it must be in csc format" - "; got (format={})".format(X.format)) if not sparse.isspmatrix_csc(P2): raise ValueError("If X is sparse, P2 must also be sparse csc" "format. Got P2 not sparse.") @@ -2021,7 +2017,7 @@ def fit(self, X, y, sample_weight=None): _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, max_inner_iter=1000, selection=self.selection, - random_state=self.random_state, + random_state=random_state, diag_fisher=self.diag_fisher) coef += d # for simplicity no line search here else: diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index d42a8739f6aa0..230bbdabae201 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -21,10 +21,20 @@ GeneralizedHyperbolicSecant, BinomialDistribution, ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge +from sklearn.metrics import mean_absolute_error from sklearn.utils.testing import assert_array_equal +@pytest.fixture(scope="module") +def regression_data(): + X, y = make_regression(n_samples=107, + n_features=10, + n_informative=80, noise=0.5, + random_state=2) + return X, y + + @pytest.mark.parametrize('link', Link.__subclasses__()) def test_link_properties(link): """Test link inverse and derivative.""" @@ -39,6 +49,10 @@ def test_link_properties(link): # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) assert_allclose(link.derivative(link.inverse(x)), 1./link.inverse_derivative(x)) + + assert ( + link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) + # for LogitLink, in the following x should be between 0 and 1. # assert_almost_equal(link.inverse_derivative(link.link(x)), # 1./link.derivative(x), decimal=decimal) @@ -108,7 +122,7 @@ def f(coef): def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" - # 1. scalar value but not positive + # scalar value but not positive X = [[1]] y = [1] weights = 0 @@ -116,17 +130,20 @@ def test_sample_weights_validation(): with pytest.raises(ValueError): glm.fit(X, y, weights) - # 2. 
2d array + # Positive weights are accepted + glm.fit(X, y, sample_weight=1) + + # 2d array weights = [[0]] with pytest.raises(ValueError): glm.fit(X, y, weights) - # 3. 1d but wrong length + # 1d but wrong length weights = [1, 0] with pytest.raises(ValueError): glm.fit(X, y, weights) - # 4. 1d but only zeros (sum not greater than 0) + # 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] @@ -643,3 +660,56 @@ def test_binomial_enet(alpha): glm.fit(X, y) assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) assert_allclose(log.coef_[0, :], glm.coef_, rtol=5e-6) + + +@pytest.mark.parametrize( + "params", + [ + {"solver": "irls", "start_params": "guess"}, + {"solver": "irls", "start_params": "zero"}, + {"solver": "lbfgs", "start_params": "guess"}, + {"solver": "lbfgs", "start_params": "zero"}, + {"solver": "newton-cg"}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": False}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": True}, + {"solver": "cd", "selection": "random", "diag_fisher": False}, + ], + ids=lambda params: ', '.join("%s=%s" % (key, val) + for key, val in params.items()) +) +def test_solver_equivalence(params, regression_data): + X, y = regression_data + est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref.fit(X, y) + + estimator = GeneralizedLinearRegressor(**params) + estimator.set_params(random_state=2) + + estimator.fit(X, y) + + assert_allclose(estimator.intercept_, est_ref.intercept_, rtol=1e-4) + assert_allclose(estimator.coef_, est_ref.coef_, rtol=1e-4) + assert_allclose( + mean_absolute_error(estimator.predict(X), y), + mean_absolute_error(est_ref.predict(X), y), + rtol=1e-4 + ) + + +def test_fit_dispersion(regression_data): + X, y = regression_data + + est1 = GeneralizedLinearRegressor(random_state=2) + est1.fit(X, y) + assert not hasattr(est1, "dispersion_") + + est2 = GeneralizedLinearRegressor(random_state=2, fit_dispersion="chisqr") + est2.fit(X, y) + assert isinstance(est2.dispersion_, float) + + est3 = GeneralizedLinearRegressor( + random_state=2, fit_dispersion="deviance") + est3.fit(X, y) + assert isinstance(est3.dispersion_, float) + + assert_allclose(est2.dispersion_, est3.dispersion_) From e3a5a9aa6855928d89bdc5dad7f3a46ba934fb8e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 12 Jun 2019 12:00:28 -0500 Subject: [PATCH 60/69] More tests and addressing some review comments --- sklearn/linear_model/_glm.py | 51 +++++++++++++------------- sklearn/linear_model/tests/test_glm.py | 43 +++++++++++++++++++++- 2 files changed, 67 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 055aacf26d747..acce438b2b5e2 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,8 +6,6 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Should the option `normalize` be included (like other linear models)? -# So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support, e.g. GCV? # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. @@ -287,7 +285,6 @@ def inverse_derivative(self, lin_pred): return ep * (1. - ep) def inverse_derivative2(self, lin_pred): - ep = special.expit(lin_pred) ep = special.expit(lin_pred) return ep * (1. - ep) * (1. 
- 2 * ep) @@ -738,7 +735,19 @@ class TweedieDistribution(ExponentialDispersionModel): For ``0 0) - 1, pgtol=self.tol, maxiter=self.max_iter, factr=1e3) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) self.n_iter_ = info['nit'] # 4.3 Newton-CG ####################################################### diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 230bbdabae201..7229f21840829 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -22,6 +22,7 @@ ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.metrics import mean_absolute_error +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.testing import assert_array_equal @@ -72,6 +73,23 @@ def test_family_bounds(family, expected): assert_array_equal(result, expected) +def test_tweedie_distribution_power(): + with pytest.raises(ValueError, match="no distribution exists"): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert dist._include_lower_bound is False + dist.power = 1 + assert dist._include_lower_bound is True + + @pytest.mark.parametrize( 'family, chk_values', [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), @@ -97,7 +115,8 @@ def test_deviance_zero(family, chk_values): (GammaDistribution(), LogLink()), (InverseGaussianDistribution(), LogLink()), (TweedieDistribution(power=1.5), LogLink()), - (TweedieDistribution(power=4.5), LogLink())]) + (TweedieDistribution(power=4.5), LogLink())], + ids=lambda args: args.__class__.__name__) def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. 
Trick: Use numerical differentiation with y = mu""" @@ -110,6 +129,11 @@ def test_fisher_matrix(family, link): weights = rng.randn(10)**2 + 1 fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, weights=weights, link=link) + # check that the Fisher matrix is square and positive definite + assert fisher.ndim == 2 + assert fisher.shape[0] == fisher.shape[1] + assert np.all(np.linalg.eigvals(fisher) >= 0) + approx = np.array([]).reshape(0, coef.shape[0]) for i in range(coef.shape[0]): def f(coef): @@ -119,6 +143,13 @@ def f(coef): [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) assert_allclose(fisher, approx, rtol=1e-3) + # check the observed information matrix + oim = family._observed_information(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + assert oim.ndim == 2 + assert oim.shape == fisher.shape + assert_allclose(oim, fisher) + def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" @@ -713,3 +744,13 @@ def test_fit_dispersion(regression_data): assert isinstance(est3.dispersion_, float) assert_allclose(est2.dispersion_, est3.dispersion_) + + +@pytest.mark.parametrize("solver", ["irls", "lbfgs", "newton-cg", "cd"]) +def test_convergence_warning(solver, regression_data): + X, y = regression_data + + est = GeneralizedLinearRegressor(solver=solver, random_state=2, + max_iter=1, tol=1e-20) + with pytest.warns(ConvergenceWarning): + est.fit(X, y) From 54b80b8db780fbd3b438c6d5dc0001cdafe7a47e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 13 Jun 2019 09:50:53 -0500 Subject: [PATCH 61/69] TST More specific checks of error messages in tests --- sklearn/linear_model/_glm.py | 6 ++-- sklearn/linear_model/tests/test_glm.py | 44 ++++++++++++++------------ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index acce438b2b5e2..c4f8cf7a975d3 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -1941,8 +1941,10 @@ def fit(self, X, y, sample_weight=None): k = np.min([10, n_features // 10 + 1]) sigma = 0 # start searching near this value which = 'SA' # find smallest algebraic eigenvalues first - if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, - which=which) >= epsneg): + eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, + which=which, + return_eigenvectors=False) + if not np.all(eigenvalues >= epsneg): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 7229f21840829..de1a5262b36ce 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -158,7 +158,7 @@ def test_sample_weights_validation(): y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="weights must be non-negative"): glm.fit(X, y, weights) # Positive weights are accepted @@ -166,24 +166,26 @@ def test_sample_weights_validation(): # 2d array weights = [[0]] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="must be 1D array or scalar"): glm.fit(X, y, weights) # 1d but wrong length weights = [1, 0] - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="weights must have the same length as y"): glm.fit(X, y, weights) # 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] - with 
pytest.raises(ValueError): + with pytest.raises(ValueError, + match="must have at least one positive element"): glm.fit(X, y, weights) # 5. 1d but with a negative value weights = [2, -1] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="weights must be non-negative"): glm.fit(X, y, weights) @@ -202,7 +204,7 @@ def test_glm_family_argument(f, fam): glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="family must be"): glm.fit(X, y) @@ -218,7 +220,7 @@ def test_glm_link_argument(l, link): assert isinstance(glm._link_instance, link.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="link must be"): glm.fit(X, y) @@ -228,7 +230,8 @@ def test_glm_alpha_argument(alpha): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="Penalty term must be a non-negative"): glm.fit(X, y) @@ -238,7 +241,8 @@ def test_glm_l1_ratio_argument(l1_ratio): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="l1_ratio must be a number in interval.*0, 1"): glm.fit(X, y) @@ -276,13 +280,13 @@ def test_glm_P2_positive_semidefinite(): P2 = Q.T @ P2 @ Q glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, check_input=True) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): glm.fit(X, y) P2 = sparse.csr_matrix(P2) glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, check_input=True) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): glm.fit(X, y) @@ -292,7 +296,7 @@ def test_glm_fit_intercept_argument(fit_intercept): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="fit_intercept must be bool"): glm.fit(X, y) @@ -314,7 +318,7 @@ def test_glm_max_iter_argument(max_iter): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(max_iter=max_iter) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="must be a positive integer"): glm.fit(X, y) @@ -324,7 +328,7 @@ def test_glm_tol_argument(tol): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(tol=tol) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="stopping criteria must be positive"): glm.fit(X, y) @@ -334,7 +338,7 @@ def test_glm_warm_start_argument(warm_start): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(warm_start=warm_start) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="warm_start must be bool"): glm.fit(X, y) @@ -356,7 +360,7 @@ def test_glm_selection_argument(selection): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(selection=selection) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="argument selection must be"): glm.fit(X, y) @@ -366,7 +370,7 @@ def test_glm_random_state_argument(random_state): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(random_state=random_state) - with 
pytest.raises(ValueError): + with pytest.raises(ValueError, match="cannot be used to seed"): glm.fit(X, y) @@ -376,7 +380,7 @@ def test_glm_diag_fisher_argument(diag_fisher): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="diag_fisher must be bool"): glm.fit(X, y) @@ -386,7 +390,7 @@ def test_glm_copy_X_argument(copy_X): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(copy_X=copy_X) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="copy_X must be bool"): glm.fit(X, y) @@ -396,7 +400,7 @@ def test_glm_check_input_argument(check_input): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(check_input=check_input) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="check_input must be bool"): glm.fit(X, y) From 7db0320f460676d8bed8e01bf64657a89532e2cb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 14 Jun 2019 10:44:16 -0500 Subject: [PATCH 62/69] Add PoissonRegressor alias --- sklearn/linear_model/__init__.py | 5 +- sklearn/linear_model/_glm.py | 163 +++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index e5ede64413eb5..121418f901a1a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -19,7 +19,7 @@ MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) from ._glm import (TweedieDistribution, - GeneralizedLinearRegressor) + GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -82,4 +82,5 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'TweedieDistribution'] + 'TweedieDistribution', + 'PoissonRegressor'] diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index c4f8cf7a975d3..b18731e73f328 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -2326,3 +2326,166 @@ def score(self, X, y, sample_weight=None): y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) return 1. - dev / dev_null + + def _more_tags(self): + return {"requires_positive_y": True} + + +class PoissonRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Poisson distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). + The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms and thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. 
bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statistic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'irls', 'lbfgs', 'newton-cg'}, optional (default='irls') + Algorithm to use in the optimization problem: + + 'irls' + Iterated reweighted least squares. It is the standard algorithm + for GLMs. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cg' + Newton conjugate gradient algorithm. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step. This gives usually good + starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + random_state : {int, RandomState instance, None}, optional (default=None) + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. 
+
+    Notes
+    -----
+    The fit itself does not need Y to be from an EDM, but only assumes
+    the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and
+    :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function
+    :math:`v(\\mu_i)` is a property of the specific EDM, see
+    :ref:`User Guide `.
+
+    The parameters :math:`w` (`coef_` and `intercept_`) are estimated by
+    minimizing the deviance plus penalty term, which is equivalent to
+    (penalized) maximum likelihood estimation.
+
+    For alpha > 0, the feature matrix X should be standardized in order to
+    penalize features equally strongly.
+
+    If the target y is a ratio, appropriate sample weights s should be
+    provided.
+    As an example, consider Poisson distributed counts z (integers) and
+    weights s=exposure (time, money, person years, ...). Then you fit
+    y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``.
+    The weights are necessary for the right (finite sample) mean.
+    Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`;
+    in this case one might say that y has a 'scaled' Poisson distribution.
+
+    References
+    ----------
+    For the coordinate descent implementation:
+        * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin
+          An Improved GLMNET for L1-regularized Logistic Regression,
+          Journal of Machine Learning Research 13 (2012) 1999-2030
+          https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf
+    """
+    def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None,
+                 solver='irls', max_iter=100,
+                 tol=1e-4, warm_start=False, start_params='guess',
+                 random_state=None, copy_X=True, check_input=True, verbose=0):
+
+        super().__init__(alpha=alpha, fit_intercept=fit_intercept,
+                         family="poisson", link='log',
+                         fit_dispersion=fit_dispersion, solver=solver,
+                         max_iter=max_iter, tol=tol, warm_start=warm_start,
+                         start_params=start_params, random_state=random_state,
+                         copy_X=copy_X, check_input=check_input,
+                         verbose=verbose)

From dcfe9edaf83509d2b6d98ee840c28f674ea4f496 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Thu, 27 Jun 2019 08:41:04 -0500
Subject: [PATCH 63/69] TST Simplify comparison with ridge

---
 sklearn/linear_model/tests/test_glm.py | 94 ++++++--------------------
 1 file changed, 22 insertions(+), 72 deletions(-)

diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py
index de1a5262b36ce..1416bdcfad680 100644
--- a/sklearn/linear_model/tests/test_glm.py
+++ b/sklearn/linear_model/tests/test_glm.py
@@ -26,6 +26,8 @@
 from sklearn.utils.testing import assert_array_equal
 
+GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg', 'cd']
+
 
 @pytest.fixture(scope="module")
 def regression_data():
@@ -404,7 +406,7 @@ def test_glm_check_input_argument(check_input):
     glm.fit(X, y)
 
 
-@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd'])
+@pytest.mark.parametrize('solver', GLM_SOLVERS)
 def test_glm_identity_regression(solver):
     """Test GLM regression with identity link on a simple dataset."""
     coef = [1., 2.]
@@ -442,97 +444,45 @@ def test_glm_log_regression(family, solver, tol): # newton-cg may issue a LineSearchWarning, which we filter out @pytest.mark.filterwarnings('ignore:The line search algorithm') @pytest.mark.filterwarnings('ignore:Line Search failed') -@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), - ('lbfgs', 1e-6), - ('newton-cg', 1e-6), - ('cd', 1e-6)]) -def test_normal_ridge(solver, tol): +@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize('solver', GLM_SOLVERS) +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): """Test ridge regression for Normal distributions. + Case n_samples >> n_features + Compare to test_ridge in test_ridge.py. """ - rng = np.random.RandomState(42) alpha = 1.0 - - # 1. With more samples than features - n_samples, n_features, n_predict = 100, 7, 10 + n_predict = 10 X, y, coef = make_regression(n_samples=n_samples+n_predict, n_features=n_features, n_informative=n_features-2, noise=0.5, - coef=True, random_state=rng) + coef=True, random_state=42) y = y[0:n_samples] X, T = X[0:n_samples], X[n_samples:] - # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, - solver='svd', normalize=False) - ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=True, - tol=tol, max_iter=100, solver=solver, - check_input=False, random_state=rng) - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) - - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, - solver='svd', normalize=False) - ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=False, - tol=tol, max_iter=100, solver=solver, - check_input=False, random_state=rng, - fit_dispersion='chisqr') - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-5) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) - mu = glm.predict(X) - assert_allclose(glm.dispersion_, - np.sum((y-mu)**2/(n_samples-n_features))) - - # 2. 
With more features than samples and sparse - n_samples, n_features, n_predict = 10, 100, 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=rng) - y = y[0:n_samples] - X, T = X[0:n_samples], X[n_samples:] + if n_samples > n_features: + ridge_params = {"solver": "svd"} + else: + ridge_params = {"solver": "sag", "max_iter": 10000, "tol": 1e-9} # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, - solver='sag', normalize=False, max_iter=100000, - random_state=42) + ridge = Ridge(alpha=alpha*n_samples, normalize=False, + random_state=42, **ridge_params) ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, - tol=tol, max_iter=300, solver=solver, - check_input=False, random_state=rng) + max_iter=300, solver=solver, tol=1e-6, + check_input=False, random_state=42) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, - solver='sag', normalize=False, max_iter=1000, - random_state=42) - ridge.fit(X, y) - - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=False, - tol=tol*2, max_iter=300, solver=solver, - check_input=False, random_state=rng) - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-4) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) - @pytest.mark.parametrize('solver, tol', [('irls', 1e-7), @@ -559,7 +509,7 @@ def test_poisson_ridge(solver, tol): rng = np.random.RandomState(42) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', - link='log', tol=tol, + link='log', tol=1e-7, solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) @@ -750,7 +700,7 @@ def test_fit_dispersion(regression_data): assert_allclose(est2.dispersion_, est3.dispersion_) -@pytest.mark.parametrize("solver", ["irls", "lbfgs", "newton-cg", "cd"]) +@pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From 4879bb6e057a95c38ee4950d13ef2dbd98da0a19 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 28 Jun 2019 09:54:59 -0500 Subject: [PATCH 64/69] EXA Add plot_tweedie_regression_insurance_claims.py --- ...lot_tweedie_regression_insurance_claims.py | 500 ++++++++++++++++++ 1 file changed, 500 insertions(+) create mode 100644 examples/linear_model/plot_tweedie_regression_insurance_claims.py diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py new file mode 100644 index 0000000000000..cb2ff667e8379 --- /dev/null +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -0,0 +1,500 @@ +""" +====================================== +Tweedie regression on insurance claims +====================================== + +This example illustrate the use Poisson, Gamma and Tweedie regression +on the French Motor Third-Party Liability Claims dataset, and is inspired +by an R tutorial [1]. 
+ +Insurance claims data consist of the number of claims and the total claim +amount. Often, the final goal is to predict the expected value, i.e. the mean, +of the total claim amount. There are several possibilities to do that, two of +which are: + +1. Model the number of claims with a Poisson distribution, the average + claim amount as a Gamma distribution and multiply the predictions, to get + the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution. + +In this example we will illustrate both approaches. We start by defining a few +helper functions for loading the data and visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +from functools import partial + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import TweedieDistribution +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer + +from sklearn.metrics import mean_absolute_error + + +def load_mtpl2(n_samples=100000): + """Fetcher for French Motor Third-Party Liability Claims dataset + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, + title=None, kind_weight=None, ax=None): + """Plot observed and predicted - aggregated per feature level. 
+ + Parameters + ---------- + df : DataFrame with at least one column named feature + observed : str + a column name of the observed target + predicted : frame + a dataframe, with the same index as df, with the predicted target + weight : str + column name with the values of weights/exposure + """ + # aggregate observed and predicted variables by feature level + df_ = df.loc[:, [feature, weight]].copy() + df_["observed"] = df[observed] * df[weight] + df_["predicted"] = y_predicted * df[weight] + df_ = ( + df_.groupby([feature])[weight, "observed", "predicted"] + .sum() + .assign(observed=lambda x: x["observed"] / x[weight]) + .assign(predicted=lambda x: x["predicted"] / x[weight]) + ) + + ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax) + y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8 + ax.fill_between( + df_.index, + 0, + y_max * df_[weight] / df_[weight].values.max(), + color="g", + alpha=0.1, + ) + ax.set( + ylabel=y_label if y_label is not None else None, + title=title if title is not None else "Train: Observed vs Predicted", + ) + + +############################################################################## +# +# 1. Loading datasets and pre-processing +# -------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``) with the freMTPL2sev table +# containing the claim amount (``ClaimAmount``) for the same user ids. + +df = load_mtpl2(n_samples=100000) + +# Note: filter out claims with zero amount, as the severity model +# requires a strictly positive target values. +df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + +# correct for unreasonable observations (that might be data error) +df["ClaimNb"].clip(upper=4, inplace=True) +df["Exposure"].clip(upper=1, inplace=True) + +column_trans = ColumnTransformer( + [ + ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "Veh_Brand_Gas_Region", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("BonusMalus", "passthrough", ["BonusMalus"]), + ( + "Density_log", + make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler() + ), + ["Density"], + ), + ], + remainder="drop", +) +X = column_trans.fit_transform(df) + + +df["Frequency"] = df.ClaimNb / df.Exposure +df["AvgClaimAmount"] = df.ClaimAmount / np.fmax(df.ClaimNb, 1) + +print(df[df.ClaimAmount > 0].head()) + +############################################################################## +# +# 2. Frequency model -- Poisson distribution +# ------------------------------------------- +# +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occuring with a constant rate in a given time interval (``Exposure``). +# Here we model the frequency ``y = ClaimNb / Exposure``, +# which is still a (scaled) Poisson distribution. +# +# A very important property of the Poisson distribution is its mean-variance +# relation: The variance is proportional to the mean. 
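A quick way to see this mean-variance relation is a small simulation on synthetic Poisson samples (a standalone sketch, independent of the dataset used in this example)::

    import numpy as np

    rng = np.random.RandomState(0)
    for mean in [0.5, 2.0, 10.0]:
        sample = rng.poisson(lam=mean, size=100000)
        # for Poisson samples the empirical variance tracks the empirical mean
        print("mean=%.3f  variance=%.3f" % (sample.mean(), sample.var()))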
+ +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) + +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + + +def mean_deviance(estimator, y, y_pred, weights): + if hasattr(estimator, "_family_instance"): + return estimator._family_instance.deviance(y, y_pred, weights) / len(y) + else: + return np.nan + + +def score_estimator( + estimator, X_train, X_test, df_train, df_test, target, weights +): + res = [] + + for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), + ]: + y, _weights = df[target], df[weights] + + for score_label, metric in [ + ("D² explaned", None), + ("mean deviance", partial(mean_deviance, estimator)), + ("mean abs. error", mean_absolute_error), + ]: + if estimator.__class__.__name__ == "ClaimProdEstimator": + # ClaimProdEstimator is the product of the frequency and + # severity models, together with a denormalized by the exposure + # values. It does not fully follow the scikit-learn API and we + # must handle it separately. + y_pred = estimator.predict(X, exposure=df.Exposure.values) + else: + y_pred = estimator.predict(X) + if metric is None: + if not hasattr(estimator, "score"): + continue + score = estimator.score(X, y, _weights) + else: + score = metric(y, y_pred, _weights) + + res.append( + {"subset": subset_label, "metric": score_label, "score": score} + ) + + res = ( + pd.DataFrame(res) + .set_index(["metric", "subset"]) + .score.unstack(-1) + .round(3) + ) + return res + + +scores = score_estimator( + glm_freq, + X_train, + X_test, + df_train, + df_test, + target="Frequency", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated by +# the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance +# bonus/penalty (``BonusMalus``), + +fig, ax = plt.subplots(2, 2, figsize=(16, 8)) +fig.subplots_adjust(hspace=0.3, wspace=0.2) + +plot_obs_pred( + df_train, + "DrivAge", + "Frequency", + glm_freq.predict(X_train), + weight="Exposure", + y_label="Claim Frequency", + title="train data", + ax=ax[0, 0], +) + +plot_obs_pred( + df_test, + "DrivAge", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[0, 1], +) + +plot_obs_pred( + df_test, + "VehAge", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[1, 0], +) + +plot_obs_pred( + df_test, + "BonusMalus", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[1, 1], +) + + +############################################################################## +# +# 3. Severity model - Gamma Distribution +# --------------------------------------- +# The mean claim amount or severity (`AvgClaimAmount`) can be empirically +# shown to follow a Gamma distribution. We fit a GLM model for the severity +# with the same features as the frequency model. +# +# Note: +# - We filter out ``ClaimAmount == 0``` as the Gamma distribution as support +# on :math:`(0, \infty)` not :math:`[0, \infty)`. +# - We use ``ClaimNb`` as sample weights. 
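For reference, the ``D² explained`` score reported by ``score_estimator`` above is one minus the ratio of the deviance of the fitted model to the deviance of a constant, mean-only model. A self-contained sketch for the Poisson case, using made-up numbers rather than this dataset::

    import numpy as np
    from scipy.special import xlogy

    def poisson_deviance(y, mu):
        # summed unit deviances: 2 * (y * log(y / mu) - y + mu)
        return np.sum(2 * (xlogy(y, y / mu) - y + mu))

    y = np.array([0., 1., 3., 2.])
    mu_model = np.array([0.4, 1.2, 2.6, 1.9])  # hypothetical predictions
    mu_null = np.full_like(y, y.mean())        # intercept-only baseline

    d2 = 1 - poisson_deviance(y, mu_model) / poisson_deviance(y, mu_null)
    print("D^2 explained: %.3f" % d2)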
+ +mask_train = df_train["ClaimAmount"] > 0 +mask_test = df_test["ClaimAmount"] > 0 + +glm_sev = GeneralizedLinearRegressor(family="gamma", alpha=1) + +glm_sev.fit( + X_train[mask_train.values], + df_train.loc[mask_train, "AvgClaimAmount"], + sample_weight=df_train.loc[mask_train, "ClaimNb"], +) + + +scores = score_estimator( + glm_sev, + X_train[mask_train.values], + X_test[mask_test.values], + df_train[mask_train], + df_test[mask_test], + target="AvgClaimAmount", + weights="ClaimNb", +) +print(scores) + +############################################################################## +# +# Note that the resulting model is conditional on having at least one claim, +# and cannot be used to predict the average claim amount in general, + +print( + "Mean AvgClaim Amount: %.2f " + % df_train.AvgClaimAmount.mean() +) +print( + "Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() +) +print( + "Predicted Mean AvgClaim Amount: %.2f" + % glm_sev.predict(X_train).mean() +) + + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated for +# the drivers age (``Driv Age``), + +fig, ax = plt.subplots(1, 2, figsize=(16, 4)) + +# plot DivAge +plot_obs_pred( + df_train.loc[mask_train], + "DrivAge", + "AvgClaimAmount", + glm_sev.predict(X_train[mask_train.values]), + weight="Exposure", + y_label="Average Claim Severity", + title="train data", + ax=ax[0], +) + +plot_obs_pred( + df_test.loc[mask_test], + "DrivAge", + "AvgClaimAmount", + glm_sev.predict(X_test[mask_test.values]), + weight="Exposure", + y_label="Average Claim Severity", + title="test data", + ax=ax[1], +) + + +############################################################################## +# +# 3. Total Claims Amount -- Compound Poisson distribution +# ------------------------------------------------------- +# +# As mentionned in the introduction, the total claim amount can be modeled +# either as the product of the frequency model by the severity model. + + +class ClaimProdEstimator: + """Total claim amount estimator + + Computed as the product of the frequency model by the serverity model, + denormalized by exposure. + """ + + def __init__(self, est_freq, est_sev): + self.est_freq = est_freq + self.est_sev = est_sev + + def predict(self, X, exposure): + """Predict the total claim amount + + The predict method is not compatible with the scikit-learn API. + """ + return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) + + +est_prod = ClaimProdEstimator(glm_freq, glm_sev) + +scores = score_estimator( + est_prod, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + + +############################################################################## +# +# or as a unique Compound Poisson model, also corresponding to a Tweedie model +# with a power :math:`p \in (1, 2)`. 
We determine the optimal hyperparameter +# ``p`` with a grid search, + +from sklearn.model_selection import GridSearchCV + +# this takes a while +params = { + "family": [ + TweedieDistribution(power=power) for power in np.linspace(1, 2, 8) + ] +} + +glm_total = GridSearchCV( + GeneralizedLinearRegressor(), cv=3, param_grid=params, n_jobs=-1 +) +glm_total.fit( + X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] +) + + +print( + "Best hyperparameters: power=%.2f\n" + % glm_total.best_estimator_.family.power +) + +scores = score_estimator( + glm_total.best_estimator_, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# In this example, the mean absolute error is lower for the Compound Poisson +# model than when using separate models for frequency and severity. +# +# We can additionally validate these models by comparing observed and predicted +# total claim amount over the test and train subsets. + +res = [] +for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), +]: + res.append( + { + "subset": subset_label, + "observed": df.ClaimAmount.values.sum(), + "predicted, frequency*severity model": np.sum( + est_prod.predict(X, exposure=df.Exposure.values) + ), + "predicted, tweedie, p=%.2f" + % glm_total.best_estimator_.family.power: np.sum( + glm_total.best_estimator_.predict(X) + ), + } + ) + +print(pd.DataFrame(res).set_index("subset").T) From 56069e5b3f5b453d9e8a487c9d27e20900ce4d63 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 28 Jun 2019 10:11:37 -0500 Subject: [PATCH 65/69] EXA Fix issues with older pandas versions in example --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index cb2ff667e8379..063d12e6e291b 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -137,8 +137,8 @@ def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # correct for unreasonable observations (that might be data error) -df["ClaimNb"].clip(upper=4, inplace=True) -df["Exposure"].clip(upper=1, inplace=True) +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) column_trans = ColumnTransformer( [ From 53f3c5f6670bbdefcd5ddf6d63e405401909cf06 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 09:14:30 -0500 Subject: [PATCH 66/69] DOC Add second poisson regression example --- ...plot_poisson_regression_non_normal_loss.py | 257 ++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 examples/linear_model/plot_poisson_regression_non_normal_loss.py diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py new file mode 100644 index 0000000000000..b06adcb787560 --- /dev/null +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -0,0 +1,257 @@ +""" +====================================== +Poisson regression and non normal loss +====================================== + +This example illustrate the use linear Poisson regression +on the French Motor 
Third-Party Liability Claims dataset [1] and compare +it with learning models with least squared error. + + +We start by defining a few helper functions for loading the data and +visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +from scipy.special import xlogy + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.ensemble import GradientBoostingRegressor + +from sklearn.metrics import mean_squared_error, mean_absolute_error + + +def load_mtpl2(n_samples=100000): + """Fetcher for French Motor Third-Party Liability Claims dataset + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +############################################################################## +# +# 1. Loading datasets and pre-processing +# -------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``) with the freMTPL2sev table +# containing the claim amount (``ClaimAmount``) for the same user ids. + +df = load_mtpl2(n_samples=100000) + +# Note: filter out claims with zero amount, as the severity model +# requires a strictly positive target values. +df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + +# correct for unreasonable observations (that might be data error) +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) + +column_trans = ColumnTransformer( + [ + ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "Veh_Brand_Gas_Region", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("BonusMalus", "passthrough", ["BonusMalus"]), + ( + "Density_log", + make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler() + ), + ["Density"], + ), + ], + remainder="drop", +) +X = column_trans.fit_transform(df) + +############################################################################## +# +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval +# (``Exposure``). 
+# Here we model the frequency ``y = ClaimNb / Exposure``,
+# which is still a (scaled) Poisson distribution.
+#
+# A very important property of the Poisson distribution is its mean-variance
+# relation: The variance is proportional to the mean.
+
+df["Frequency"] = df.ClaimNb / df.Exposure
+
+print(
+    pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts()
+)
+
+##############################################################################
+#
+# It is worth noting that 96 % of users have 0 claims, and if we were to
+# convert this problem into a binary classification task, it would be
+# significantly imbalanced.
+#
+# To evaluate the pertinence of the metrics used below, we will consider as a
+# baseline an estimator that constantly predicts 0.
+
+df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2)
+
+
+def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None):
+    y_true = np.atleast_1d(y_true)
+    y_pred = np.atleast_1d(y_pred)
+    dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred)
+    return np.average(dev, weights=sample_weights)
+
+
+eps = 1e-5
+print("MSE: %.3f" % mean_squared_error(
+    df_test.Frequency.values, np.zeros(len(df_test)),
+    df_test.Exposure.values))
+print("MAE: %.3f" % mean_absolute_error(
+    df_test.Frequency.values, np.zeros(len(df_test)),
+    df_test.Exposure.values))
+print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score(
+    df_test.Frequency.values, eps + np.zeros(len(df_test)),
+    df_test.Exposure.values))
+
+
+##############################################################################
+#
+# We start by modeling the target variable with the least squares linear
+# regression model,
+
+
+linregr = LinearRegression()
+linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure)
+
+print("LinearRegression")
+print("MSE: %.3f" % mean_squared_error(
+    df_test.Frequency.values, linregr.predict(X_test),
+    df_test.Exposure.values))
+print("MAE: %.3f" % mean_absolute_error(
+    df_test.Frequency.values, linregr.predict(X_test),
+    df_test.Exposure.values))
+print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score(
+    df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps),
+    df_test.Exposure.values))
+
+##############################################################################
+#
+# The Poisson deviance cannot be computed on the raw predictions, because the
+# model predicts negative values (hence the flooring at ``eps`` above),
+
+print('Number of negative predictions: %s / total: %s' % (
+    (linregr.predict(X_test) < 0).sum(), X_test.shape[0]))
+
+##############################################################################
+#
+# Next we fit the Poisson regressor on the target variable,
+
+glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0)
+glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure)
+
+print("PoissonRegressor")
+print("MSE: %.3f" % mean_squared_error(
+    df_test.Frequency.values, glm_freq.predict(X_test),
+    df_test.Exposure.values))
+print("MAE: %.3f" % mean_absolute_error(
+    df_test.Frequency.values, glm_freq.predict(X_test),
+    df_test.Exposure.values))
+print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score(
+    df_test.Frequency.values, glm_freq.predict(X_test),
+    df_test.Exposure.values))
+
+##############################################################################
+#
+# Finally, we will consider a non-linear model with gradient boosting that
+# still minimizes the least squares error.
+ + +gbr = GradientBoostingRegressor(max_depth=3) +gbr.fit(X_train, df_train.Frequency.values, + sample_weight=df_train.Exposure.values) + + +print("GradientBoostingRegressor") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) + +############################################################################## +# +# In this example, although Gradient boosting minimizes the least square error, +# because of a higher predictive power it also results in a smaller Poisson +# deviance than the Poisson regression model. +# +# Evaluating models with a single train / test split is prone to numerical +# errors, we can verify that we would also get equivalent resuts with the +# cross-validation score. +# +# The difference between these models can also be visualized by comparing the +# histogram of observed target values with that of predicted values, + + +fig, ax = plt.subplots(1, 4, figsize=(16, 3)) + +df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=ax[0]) + +ax[0].set_title('Experimental data') + +for idx, model in enumerate([linregr, glm_freq, gbr]): + y_pred = model.predict(X_train) + + pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) + ax[idx+1].set_title(model.__class__.__name__) + +for axi in ax: + axi.set( + yscale='log', + xlabel="y (Frequency)" + ) From e58d8e3a35ad0d5a6ceda167d5452d885ec855a3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Jul 2019 18:20:09 +0200 Subject: [PATCH 67/69] EXA wording and score in plot_tweedie_regression_insurance_claims.html --- ...lot_tweedie_regression_insurance_claims.py | 137 ++++++++++-------- 1 file changed, 78 insertions(+), 59 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 063d12e6e291b..00111b811f923 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -3,7 +3,7 @@ Tweedie regression on insurance claims ====================================== -This example illustrate the use Poisson, Gamma and Tweedie regression +This example illustrates the use of Poisson, Gamma and Tweedie regression on the French Motor Third-Party Liability Claims dataset, and is inspired by an R tutorial [1]. @@ -13,9 +13,10 @@ which are: 1. Model the number of claims with a Poisson distribution, the average - claim amount as a Gamma distribution and multiply the predictions, to get - the total claim amount. -2. Model total claim amount directly, typically with a Tweedie distribution. + claim amount as a Gamma distribution and multiply the predictions of both in + order to get the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution of + Tweedie power :math:`p \\in (1, 2)`. In this example we will illustrate both approaches. We start by defining a few helper functions for loading the data and visualizing results. @@ -49,7 +50,7 @@ def load_mtpl2(n_samples=100000): - """Fetcher for French Motor Third-Party Liability Claims dataset + """Fetch the French Motor Third-Party Liability Claims dataset. 
Parameters ---------- @@ -81,24 +82,27 @@ def load_mtpl2(n_samples=100000): return df.iloc[:n_samples] -def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, +def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, title=None, kind_weight=None, ax=None): """Plot observed and predicted - aggregated per feature level. Parameters ---------- - df : DataFrame with at least one column named feature + df : DataFrame with at least three columns named feature, weight and + observed + feature: str + a column name of df for the feature to be plotted + weight : str + column name of df with the values of weights or exposure observed : str - a column name of the observed target + a column name of df with the observed target predicted : frame a dataframe, with the same index as df, with the predicted target - weight : str - column name with the values of weights/exposure """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() df_["observed"] = df[observed] * df[weight] - df_["predicted"] = y_predicted * df[weight] + df_["predicted"] = predicted * df[weight] df_ = ( df_.groupby([feature])[weight, "observed", "predicted"] .sum() @@ -126,9 +130,10 @@ def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, # 1. Loading datasets and pre-processing # -------------------------------------- # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, -# containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same user ids. +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``), with the freMTPL2sev table, +# containing the claim amount (``ClaimAmount``) for the same policy ids +# (``IDpol``). df = load_mtpl2(n_samples=100000) @@ -206,14 +211,14 @@ def score_estimator( y, _weights = df[target], df[weights] for score_label, metric in [ - ("D² explaned", None), + ("D² explained", None), ("mean deviance", partial(mean_deviance, estimator)), ("mean abs. error", mean_absolute_error), ]: if estimator.__class__.__name__ == "ClaimProdEstimator": - # ClaimProdEstimator is the product of the frequency and - # severity models, together with a denormalized by the exposure - # values. It does not fully follow the scikit-learn API and we + # ClaimProdEstimator is the product of frequency and severity + # models, denormalized by the exposure values. + # It does not fully follow the scikit-learn API and we # must handle it separately. y_pred = estimator.predict(X, exposure=df.Exposure.values) else: @@ -253,50 +258,50 @@ def score_estimator( # # We can visually compare observed and predicted values, aggregated by # the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance -# bonus/penalty (``BonusMalus``), +# bonus/malus (``BonusMalus``). 
fig, ax = plt.subplots(2, 2, figsize=(16, 8)) fig.subplots_adjust(hspace=0.3, wspace=0.2) plot_obs_pred( - df_train, - "DrivAge", - "Frequency", - glm_freq.predict(X_train), + df=df_train, + feature="DrivAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_train), y_label="Claim Frequency", title="train data", ax=ax[0, 0], ) plot_obs_pred( - df_test, - "DrivAge", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="DrivAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[0, 1], ) plot_obs_pred( - df_test, - "VehAge", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="VehAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[1, 0], ) plot_obs_pred( - df_test, - "BonusMalus", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="BonusMalus", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[1, 1], @@ -308,12 +313,13 @@ def score_estimator( # 3. Severity model - Gamma Distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically -# shown to follow a Gamma distribution. We fit a GLM model for the severity -# with the same features as the frequency model. +# shown to follow approximately a Gamma distribution. We fit a GLM model for +# the severity with the same features as the frequency model. # # Note: -# - We filter out ``ClaimAmount == 0``` as the Gamma distribution as support -# on :math:`(0, \infty)` not :math:`[0, \infty)`. +# +# - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support +# on :math:`(0, \infty)`, not :math:`[0, \infty)`. # - We use ``ClaimNb`` as sample weights. mask_train = df_train["ClaimAmount"] > 0 @@ -341,19 +347,20 @@ def score_estimator( ############################################################################## # -# Note that the resulting model is conditional on having at least one claim, -# and cannot be used to predict the average claim amount in general, +# Note that the resulting model is the average claim amount per claim. As such, +# it is conditional on having at least one claim, and cannot be used to predict +# the average claim amount per policy in general. print( - "Mean AvgClaim Amount: %.2f " + "Mean AvgClaim Amount per policy: %.2f " % df_train.AvgClaimAmount.mean() ) print( - "Mean AvgClaim Amount | NbClaim > 0: %.2f" + "Mean AvgClaim Amount | NbClaim > 0: %.2f" % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() ) print( - "Predicted Mean AvgClaim Amount: %.2f" + "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" % glm_sev.predict(X_train).mean() ) @@ -361,28 +368,28 @@ def score_estimator( ############################################################################## # # We can visually compare observed and predicted values, aggregated for -# the drivers age (``Driv Age``), +# the drivers age (``DrivAge``). 
fig, ax = plt.subplots(1, 2, figsize=(16, 4)) # plot DivAge plot_obs_pred( - df_train.loc[mask_train], - "DrivAge", - "AvgClaimAmount", - glm_sev.predict(X_train[mask_train.values]), + df=df_train.loc[mask_train], + feature="DrivAge", weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_train[mask_train.values]), y_label="Average Claim Severity", title="train data", ax=ax[0], ) plot_obs_pred( - df_test.loc[mask_test], - "DrivAge", - "AvgClaimAmount", - glm_sev.predict(X_test[mask_test.values]), + df=df_test.loc[mask_test], + feature="DrivAge", weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_test[mask_test.values]), y_label="Average Claim Severity", title="test data", ax=ax[1], @@ -391,31 +398,41 @@ def score_estimator( ############################################################################## # -# 3. Total Claims Amount -- Compound Poisson distribution +# 4. Total Claims Amount -- Compound Poisson distribution # ------------------------------------------------------- # # As mentionned in the introduction, the total claim amount can be modeled -# either as the product of the frequency model by the severity model. +# either as the product of the frequency model by the severity model, class ClaimProdEstimator: - """Total claim amount estimator + """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. + denormalized by exposure. Use Tweedie deviance with `p=1.5`. """ def __init__(self, est_freq, est_sev): self.est_freq = est_freq self.est_sev = est_sev + self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): - """Predict the total claim amount + """Predict the total claim amount. The predict method is not compatible with the scikit-learn API. """ return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) + def score(self, X, y, sample_weight=None): + """Compute D², the percentage of deviance explained.""" + mu = self.predict(X, exposure=sample_weight) + dev = self._family_instance.deviance(y, mu, weights=sample_weight) + y_mean = np.average(y, weights=sample_weight) + dev_null = self._family_instance.deviance(y, y_mean, + weights=sample_weight) + return 1. - dev / dev_null + est_prod = ClaimProdEstimator(glm_freq, glm_sev) @@ -476,7 +493,9 @@ def predict(self, X, exposure): # model than when using separate models for frequency and severity. # # We can additionally validate these models by comparing observed and predicted -# total claim amount over the test and train subsets. +# total claim amount over the test and train subsets. We see that in our case +# the frequency-severity model underestimates the total claim amount, whereas +# the Tweedie model overestimates. 
res = [] for subset_label, X, df in [ From c3fc3929f23d38b0a655cfcbf60a600d79777666 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 15 Jul 2019 14:15:26 +0200 Subject: [PATCH 68/69] Address review comments --- .../plot_tweedie_regression_insurance_claims.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 00111b811f923..1c8dd42df336d 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -46,7 +46,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.metrics import mean_absolute_error +from sklearn.metrics import mean_absolute_error, mean_squared_error def load_mtpl2(n_samples=100000): @@ -83,7 +83,7 @@ def load_mtpl2(n_samples=100000): def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, - title=None, kind_weight=None, ax=None): + title=None, ax=None): """Plot observed and predicted - aggregated per feature level. Parameters @@ -141,9 +141,11 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # requires a strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 -# correct for unreasonable observations (that might be data error) +# Correct for unreasonable observations (that might be data error) +# and a few exceptionally large claim amounts df["ClaimNb"] = df["ClaimNb"].clip(upper=4) df["Exposure"] = df["Exposure"].clip(upper=1) +df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) column_trans = ColumnTransformer( [ @@ -188,7 +190,9 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +# Some of the features are colinear, we use a weak penalization to avoid +# numerical issues. +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=1e-2) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) @@ -214,6 +218,7 @@ def score_estimator( ("D² explained", None), ("mean deviance", partial(mean_deviance, estimator)), ("mean abs. 
error", mean_absolute_error), + ("mean squared error", mean_squared_error), ]: if estimator.__class__.__name__ == "ClaimProdEstimator": # ClaimProdEstimator is the product of frequency and severity @@ -325,7 +330,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GeneralizedLinearRegressor(family="gamma", alpha=1) +glm_sev = GeneralizedLinearRegressor(family="gamma") glm_sev.fit( X_train[mask_train.values], From 98054bc9a4416c49b26a3a253b9a7bef16a1e27b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 27 Nov 2019 22:03:20 +0100 Subject: [PATCH 69/69] fix sparse P2 cases --- sklearn/linear_model/_glm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index b18731e73f328..b3c303ee9770b 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -1862,8 +1862,7 @@ def fit(self, X, y, sample_weight=None): elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and P2.shape[0] == X.shape[1]): if sparse.issparse(X): - P2 = (sparse.dia_matrix((P2, 0), - shape=(n_features, n_features))).tocsc() + P2 = sparse.csc_matrix(P2) else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " @@ -1939,7 +1938,7 @@ def fit(self, X, y, sample_weight=None): # efficiently, use only half of n_features # k = how many eigenvals to compute k = np.min([10, n_features // 10 + 1]) - sigma = 0 # start searching near this value + sigma = -1000 * epsneg # start searching near this value which = 'SA' # find smallest algebraic eigenvalues first eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, which=which,